chupa-text 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/chupa-text.gemspec +3 -2
- data/doc/text/news.md +6 -0
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +1 -3
- data/lib/chupa-text/decomposers/office-open-xml.rb +7 -15
- data/lib/chupa-text/decomposers/opendocument-presentation.rb +2 -4
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +2 -4
- data/lib/chupa-text/decomposers/opendocument-text.rb +2 -4
- data/lib/chupa-text/decomposers/opendocument.rb +5 -12
- data/lib/chupa-text/sax-parser.rb +151 -0
- data/lib/chupa-text/version.rb +2 -2
- metadata +20 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65f2eb5be135f3db4a0cbb81bc4a268bcb752934da6d9aa32740842bb4cc674f
|
4
|
+
data.tar.gz: 72f8487dc57b67293e001621f6b63ece4680cf1069d9f98e3daa89708571766b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 32149617d9c921de856aa9fb3856a8a29fde5e9e1872a16241235499b1b8e014941928fb040ad60d52c15cbc4b69d84a9860bd432074964865e7f1e85d673a1e
|
7
|
+
data.tar.gz: 85c2820468547c9b34fef27faa2db0fc9d8208a266949177b2e4f2269bfadcd8f5b534c0cdb4aff9e28360c1509dd6b2d06ef72c7d1e268f399e97595a133a2c
|
data/chupa-text.gemspec
CHANGED
@@ -53,8 +53,9 @@ Gem::Specification.new do |spec|
|
|
53
53
|
spec.add_runtime_dependency("archive-zip")
|
54
54
|
|
55
55
|
spec.add_development_dependency("bundler")
|
56
|
-
spec.add_development_dependency("
|
57
|
-
spec.add_development_dependency("test-unit")
|
56
|
+
spec.add_development_dependency("nokogiri")
|
58
57
|
spec.add_development_dependency("packnga")
|
58
|
+
spec.add_development_dependency("rake")
|
59
59
|
spec.add_development_dependency("redcarpet")
|
60
|
+
spec.add_development_dependency("test-unit")
|
60
61
|
end
|
data/doc/text/news.md
CHANGED
@@ -14,12 +14,10 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "cgi/util"
|
18
|
-
require "rexml/parsers/sax2parser"
|
19
|
-
require "rexml/sax2listener"
|
20
|
-
|
21
17
|
require "archive/zip"
|
22
18
|
|
19
|
+
require "chupa-text/sax-parser"
|
20
|
+
|
23
21
|
module ChupaText
|
24
22
|
module Decomposers
|
25
23
|
class OfficeOpenXML < Decomposer
|
@@ -67,10 +65,8 @@ module ChupaText
|
|
67
65
|
end
|
68
66
|
|
69
67
|
private
|
70
|
-
def parse(
|
71
|
-
|
72
|
-
parser = REXML::Parsers::SAX2Parser.new(source)
|
73
|
-
parser.listen(listener)
|
68
|
+
def parse(input, listener)
|
69
|
+
parser = SAXParser.new(input, listener)
|
74
70
|
parser.parse
|
75
71
|
end
|
76
72
|
|
@@ -83,9 +79,7 @@ module ChupaText
|
|
83
79
|
context[:text]
|
84
80
|
end
|
85
81
|
|
86
|
-
class TextListener
|
87
|
-
include REXML::SAX2Listener
|
88
|
-
|
82
|
+
class TextListener < SAXListener
|
89
83
|
def initialize(output, target_uri)
|
90
84
|
@output = output
|
91
85
|
@target_uri = target_uri
|
@@ -121,13 +115,11 @@ module ChupaText
|
|
121
115
|
private
|
122
116
|
def add_text(text)
|
123
117
|
return unless @in_target
|
124
|
-
@output <<
|
118
|
+
@output << text
|
125
119
|
end
|
126
120
|
end
|
127
121
|
|
128
|
-
class AttributesListener
|
129
|
-
include REXML::SAX2Listener
|
130
|
-
|
122
|
+
class AttributesListener < SAXListener
|
131
123
|
CORE_PROPERTIES_URI =
|
132
124
|
"http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
133
125
|
EXTENDED_PROPERTIES_URI =
|
@@ -49,9 +49,7 @@ module ChupaText
|
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
|
-
class SlidesListener
|
53
|
-
include REXML::SAX2Listener
|
54
|
-
|
52
|
+
class SlidesListener < SAXListener
|
55
53
|
TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
56
54
|
DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
|
57
55
|
|
@@ -97,7 +95,7 @@ module ChupaText
|
|
97
95
|
private
|
98
96
|
def add_text(text)
|
99
97
|
return unless @in_p
|
100
|
-
@slides.last[:text] <<
|
98
|
+
@slides.last[:text] << text
|
101
99
|
end
|
102
100
|
end
|
103
101
|
end
|
@@ -51,9 +51,7 @@ module ChupaText
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
class SheetsListener
|
55
|
-
include REXML::SAX2Listener
|
56
|
-
|
54
|
+
class SheetsListener < SAXListener
|
57
55
|
TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
58
56
|
TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"
|
59
57
|
|
@@ -126,7 +124,7 @@ module ChupaText
|
|
126
124
|
private
|
127
125
|
def add_text(text)
|
128
126
|
return unless @in_p
|
129
|
-
@sheets.last[:rows].last.last[:text] <<
|
127
|
+
@sheets.last[:rows].last.last[:text] << text
|
130
128
|
end
|
131
129
|
end
|
132
130
|
end
|
@@ -43,9 +43,7 @@ module ChupaText
|
|
43
43
|
yield(text_data)
|
44
44
|
end
|
45
45
|
|
46
|
-
class TextListener
|
47
|
-
include REXML::SAX2Listener
|
48
|
-
|
46
|
+
class TextListener < SAXListener
|
49
47
|
TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
50
48
|
def initialize(output)
|
51
49
|
@output = output
|
@@ -81,7 +79,7 @@ module ChupaText
|
|
81
79
|
private
|
82
80
|
def add_text(text)
|
83
81
|
return unless @in_p
|
84
|
-
@output <<
|
82
|
+
@output << text
|
85
83
|
end
|
86
84
|
end
|
87
85
|
end
|
@@ -14,12 +14,10 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "cgi/util"
|
18
|
-
require "rexml/parsers/sax2parser"
|
19
|
-
require "rexml/sax2listener"
|
20
|
-
|
21
17
|
require "archive/zip"
|
22
18
|
|
19
|
+
require "chupa-text/sax-parser"
|
20
|
+
|
23
21
|
module ChupaText
|
24
22
|
module Decomposers
|
25
23
|
class OpenDocument < Decomposer
|
@@ -58,10 +56,8 @@ module ChupaText
|
|
58
56
|
end
|
59
57
|
|
60
58
|
private
|
61
|
-
def parse(
|
62
|
-
|
63
|
-
parser = REXML::Parsers::SAX2Parser.new(source)
|
64
|
-
parser.listen(listener)
|
59
|
+
def parse(input, listener)
|
60
|
+
parser = SAXParser.new(input, listener)
|
65
61
|
parser.parse
|
66
62
|
end
|
67
63
|
|
@@ -70,9 +66,7 @@ module ChupaText
|
|
70
66
|
parse(entry.file_data, listener)
|
71
67
|
end
|
72
68
|
|
73
|
-
class AttributesListener
|
74
|
-
include REXML::SAX2Listener
|
75
|
-
|
69
|
+
class AttributesListener < SAXListener
|
76
70
|
META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
77
71
|
DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
|
78
72
|
|
@@ -122,7 +116,6 @@ module ChupaText
|
|
122
116
|
def set_attribute(value)
|
123
117
|
return if @name.nil?
|
124
118
|
|
125
|
-
value = CGI.unescapeHTML(value)
|
126
119
|
case @type
|
127
120
|
when :w3cdtf
|
128
121
|
value = Time.xmlschema(value)
|
@@ -0,0 +1,151 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "cgi/util"
|
18
|
+
require "rexml/parsers/sax2parser"
|
19
|
+
require "rexml/sax2listener"
|
20
|
+
|
21
|
+
begin
|
22
|
+
require "nokogiri"
|
23
|
+
rescue LoadError
|
24
|
+
end
|
25
|
+
|
26
|
+
module ChupaText
|
27
|
+
class SAXParser
|
28
|
+
class << self
|
29
|
+
def backend
|
30
|
+
case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
|
31
|
+
when "rexml"
|
32
|
+
:rexml
|
33
|
+
else
|
34
|
+
if Object.const_defined?(:Nokogiri)
|
35
|
+
:nokogiri
|
36
|
+
else
|
37
|
+
:rexml
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def initialize(input, listener)
|
44
|
+
@input = input
|
45
|
+
@listener = listener
|
46
|
+
end
|
47
|
+
|
48
|
+
if backend == :nokogiri
|
49
|
+
def parse
|
50
|
+
document = Document.new(@listener)
|
51
|
+
parser = Nokogiri::XML::SAX::Parser.new(document)
|
52
|
+
parser.parse(@input)
|
53
|
+
end
|
54
|
+
|
55
|
+
class Document < Nokogiri::XML::SAX::Document
|
56
|
+
def initialize(listener)
|
57
|
+
@listener = listener
|
58
|
+
@namespaces_stack = []
|
59
|
+
end
|
60
|
+
|
61
|
+
def start_element_namespace(name,
|
62
|
+
attributes=[],
|
63
|
+
prefix=nil,
|
64
|
+
uri=nil,
|
65
|
+
namespaces=[])
|
66
|
+
namespaces.each do |namespace_prefix, namespace_uri|
|
67
|
+
@listener.start_prefix_mapping(namespace_prefix, namespace_uri)
|
68
|
+
end
|
69
|
+
attributes_hash = {}
|
70
|
+
attributes.each do |attribute|
|
71
|
+
attribute_qname = build_qname(attribute.prefix, attribute.localname)
|
72
|
+
attributes_hash[attribute_qname] = attribute.value
|
73
|
+
end
|
74
|
+
@namespaces_stack.push(namespaces)
|
75
|
+
@listener.start_element(uri,
|
76
|
+
name,
|
77
|
+
build_qname(prefix, name),
|
78
|
+
attributes_hash)
|
79
|
+
end
|
80
|
+
|
81
|
+
def end_element_namespace(name, prefix=nil, uri=nil)
|
82
|
+
@listener.end_element(uri, name, build_qname(prefix, name))
|
83
|
+
namespaces = @namespaces_stack.pop
|
84
|
+
namespaces.each do |namespace_prefix, _|
|
85
|
+
@listener.end_prefix_mapping(namespace_prefix)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def characters(text)
|
90
|
+
@listener.characters(text)
|
91
|
+
end
|
92
|
+
|
93
|
+
def cdata_block(content)
|
94
|
+
@listener.cdata(content)
|
95
|
+
end
|
96
|
+
|
97
|
+
private
|
98
|
+
def build_qname(prefix, local_name)
|
99
|
+
if prefix
|
100
|
+
"#{prefix}:#{local_name}"
|
101
|
+
else
|
102
|
+
local_name
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
else
|
107
|
+
def parse
|
108
|
+
source = REXML::Source.new(@input.read)
|
109
|
+
parser = REXML::Parsers::SAX2Parser.new(source)
|
110
|
+
parser.listen(Listener.new(@listener))
|
111
|
+
parser.parse
|
112
|
+
end
|
113
|
+
|
114
|
+
class Listener
|
115
|
+
include REXML::SAX2Listener
|
116
|
+
|
117
|
+
def initialize(listener)
|
118
|
+
@listener = listener
|
119
|
+
end
|
120
|
+
|
121
|
+
def start_prefix_mapping(*args)
|
122
|
+
@listener.start_prefix_mapping(*args)
|
123
|
+
end
|
124
|
+
|
125
|
+
def end_prefix_mapping(*args)
|
126
|
+
@listener.end_prefix_mapping(*args)
|
127
|
+
end
|
128
|
+
|
129
|
+
def start_element(*args)
|
130
|
+
@listener.start_element(*args)
|
131
|
+
end
|
132
|
+
|
133
|
+
def end_element(*args)
|
134
|
+
@listener.end_element(*args)
|
135
|
+
end
|
136
|
+
|
137
|
+
def characters(text)
|
138
|
+
@listener.characters(CGI.unescapeHTML(text))
|
139
|
+
end
|
140
|
+
|
141
|
+
def cdata(content)
|
142
|
+
@listener.cdata(CGI.unescapeHTML(content))
|
143
|
+
end
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
class SAXListener
|
149
|
+
include REXML::SAX2Listener
|
150
|
+
end
|
151
|
+
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -15,5 +15,5 @@
|
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
17
|
module ChupaText
|
18
|
-
VERSION = "1.1.4"
|
18
|
+
VERSION = "1.1.5"
|
19
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.4
|
4
|
+
version: 1.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: archive-zip
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -53,7 +53,7 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: packnga
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ">="
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: test-unit
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description: ''
|
98
112
|
email:
|
99
113
|
- kou@clear-code.com
|
@@ -156,6 +170,7 @@ files:
|
|
156
170
|
- lib/chupa-text/logger.rb
|
157
171
|
- lib/chupa-text/mime-type-registry.rb
|
158
172
|
- lib/chupa-text/mime-type.rb
|
173
|
+
- lib/chupa-text/sax-parser.rb
|
159
174
|
- lib/chupa-text/screenshot.rb
|
160
175
|
- lib/chupa-text/size-parser.rb
|
161
176
|
- lib/chupa-text/text-data.rb
|