chupa-text 1.1.4 → 1.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/chupa-text.gemspec +3 -2
- data/doc/text/news.md +6 -0
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +1 -3
- data/lib/chupa-text/decomposers/office-open-xml.rb +7 -15
- data/lib/chupa-text/decomposers/opendocument-presentation.rb +2 -4
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +2 -4
- data/lib/chupa-text/decomposers/opendocument-text.rb +2 -4
- data/lib/chupa-text/decomposers/opendocument.rb +5 -12
- data/lib/chupa-text/sax-parser.rb +151 -0
- data/lib/chupa-text/version.rb +2 -2
- metadata +20 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65f2eb5be135f3db4a0cbb81bc4a268bcb752934da6d9aa32740842bb4cc674f
|
4
|
+
data.tar.gz: 72f8487dc57b67293e001621f6b63ece4680cf1069d9f98e3daa89708571766b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 32149617d9c921de856aa9fb3856a8a29fde5e9e1872a16241235499b1b8e014941928fb040ad60d52c15cbc4b69d84a9860bd432074964865e7f1e85d673a1e
|
7
|
+
data.tar.gz: 85c2820468547c9b34fef27faa2db0fc9d8208a266949177b2e4f2269bfadcd8f5b534c0cdb4aff9e28360c1509dd6b2d06ef72c7d1e268f399e97595a133a2c
|
data/chupa-text.gemspec
CHANGED
@@ -53,8 +53,9 @@ Gem::Specification.new do |spec|
|
|
53
53
|
spec.add_runtime_dependency("archive-zip")
|
54
54
|
|
55
55
|
spec.add_development_dependency("bundler")
|
56
|
-
spec.add_development_dependency("
|
57
|
-
spec.add_development_dependency("test-unit")
|
56
|
+
spec.add_development_dependency("nokogiri")
|
58
57
|
spec.add_development_dependency("packnga")
|
58
|
+
spec.add_development_dependency("rake")
|
59
59
|
spec.add_development_dependency("redcarpet")
|
60
|
+
spec.add_development_dependency("test-unit")
|
60
61
|
end
|
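data/doc/text/news.md
CHANGED
(hunks for this file are not included in this extract)
data/lib/chupa-text/decomposers/office-open-xml-workbook.rb
CHANGED
(hunks for this file are not included in this extract)
data/lib/chupa-text/decomposers/office-open-xml.rb
CHANGED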
@@ -14,12 +14,10 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "cgi/util"
|
18
|
-
require "rexml/parsers/sax2parser"
|
19
|
-
require "rexml/sax2listener"
|
20
|
-
|
21
17
|
require "archive/zip"
|
22
18
|
|
19
|
+
require "chupa-text/sax-parser"
|
20
|
+
|
23
21
|
module ChupaText
|
24
22
|
module Decomposers
|
25
23
|
class OfficeOpenXML < Decomposer
|
@@ -67,10 +65,8 @@ module ChupaText
|
|
67
65
|
end
|
68
66
|
|
69
67
|
private
|
70
|
-
def parse(
|
71
|
-
|
72
|
-
parser = REXML::Parsers::SAX2Parser.new(source)
|
73
|
-
parser.listen(listener)
|
68
|
+
def parse(input, listener)
|
69
|
+
parser = SAXParser.new(input, listener)
|
74
70
|
parser.parse
|
75
71
|
end
|
76
72
|
|
@@ -83,9 +79,7 @@ module ChupaText
|
|
83
79
|
context[:text]
|
84
80
|
end
|
85
81
|
|
86
|
-
class TextListener
|
87
|
-
include REXML::SAX2Listener
|
88
|
-
|
82
|
+
class TextListener < SAXListener
|
89
83
|
def initialize(output, target_uri)
|
90
84
|
@output = output
|
91
85
|
@target_uri = target_uri
|
@@ -121,13 +115,11 @@ module ChupaText
|
|
121
115
|
private
|
122
116
|
def add_text(text)
|
123
117
|
return unless @in_target
|
124
|
-
@output <<
|
118
|
+
@output << text
|
125
119
|
end
|
126
120
|
end
|
127
121
|
|
128
|
-
class AttributesListener
|
129
|
-
include REXML::SAX2Listener
|
130
|
-
|
122
|
+
class AttributesListener < SAXListener
|
131
123
|
CORE_PROPERTIES_URI =
|
132
124
|
"http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
133
125
|
EXTENDED_PROPERTIES_URI =
|
data/lib/chupa-text/decomposers/opendocument-presentation.rb
CHANGED
@@ -49,9 +49,7 @@ module ChupaText
|
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
|
-
class SlidesListener
|
53
|
-
include REXML::SAX2Listener
|
54
|
-
|
52
|
+
class SlidesListener < SAXListener
|
55
53
|
TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
56
54
|
DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
|
57
55
|
|
@@ -97,7 +95,7 @@ module ChupaText
|
|
97
95
|
private
|
98
96
|
def add_text(text)
|
99
97
|
return unless @in_p
|
100
|
-
@slides.last[:text] <<
|
98
|
+
@slides.last[:text] << text
|
101
99
|
end
|
102
100
|
end
|
103
101
|
end
|
data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb
CHANGED
@@ -51,9 +51,7 @@ module ChupaText
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
class SheetsListener
|
55
|
-
include REXML::SAX2Listener
|
56
|
-
|
54
|
+
class SheetsListener < SAXListener
|
57
55
|
TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
58
56
|
TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"
|
59
57
|
|
@@ -126,7 +124,7 @@ module ChupaText
|
|
126
124
|
private
|
127
125
|
def add_text(text)
|
128
126
|
return unless @in_p
|
129
|
-
@sheets.last[:rows].last.last[:text] <<
|
127
|
+
@sheets.last[:rows].last.last[:text] << text
|
130
128
|
end
|
131
129
|
end
|
132
130
|
end
|
data/lib/chupa-text/decomposers/opendocument-text.rb
CHANGED
@@ -43,9 +43,7 @@ module ChupaText
|
|
43
43
|
yield(text_data)
|
44
44
|
end
|
45
45
|
|
46
|
-
class TextListener
|
47
|
-
include REXML::SAX2Listener
|
48
|
-
|
46
|
+
class TextListener < SAXListener
|
49
47
|
TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
50
48
|
def initialize(output)
|
51
49
|
@output = output
|
@@ -81,7 +79,7 @@ module ChupaText
|
|
81
79
|
private
|
82
80
|
def add_text(text)
|
83
81
|
return unless @in_p
|
84
|
-
@output <<
|
82
|
+
@output << text
|
85
83
|
end
|
86
84
|
end
|
87
85
|
end
|
data/lib/chupa-text/decomposers/opendocument.rb
CHANGED
@@ -14,12 +14,10 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
-
require "cgi/util"
|
18
|
-
require "rexml/parsers/sax2parser"
|
19
|
-
require "rexml/sax2listener"
|
20
|
-
|
21
17
|
require "archive/zip"
|
22
18
|
|
19
|
+
require "chupa-text/sax-parser"
|
20
|
+
|
23
21
|
module ChupaText
|
24
22
|
module Decomposers
|
25
23
|
class OpenDocument < Decomposer
|
@@ -58,10 +56,8 @@ module ChupaText
|
|
58
56
|
end
|
59
57
|
|
60
58
|
private
|
61
|
-
def parse(
|
62
|
-
|
63
|
-
parser = REXML::Parsers::SAX2Parser.new(source)
|
64
|
-
parser.listen(listener)
|
59
|
+
def parse(input, listener)
|
60
|
+
parser = SAXParser.new(input, listener)
|
65
61
|
parser.parse
|
66
62
|
end
|
67
63
|
|
@@ -70,9 +66,7 @@ module ChupaText
|
|
70
66
|
parse(entry.file_data, listener)
|
71
67
|
end
|
72
68
|
|
73
|
-
class AttributesListener
|
74
|
-
include REXML::SAX2Listener
|
75
|
-
|
69
|
+
class AttributesListener < SAXListener
|
76
70
|
META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
77
71
|
DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
|
78
72
|
|
@@ -122,7 +116,6 @@ module ChupaText
|
|
122
116
|
def set_attribute(value)
|
123
117
|
return if @name.nil?
|
124
118
|
|
125
|
-
value = CGI.unescapeHTML(value)
|
126
119
|
case @type
|
127
120
|
when :w3cdtf
|
128
121
|
value = Time.xmlschema(value)
|
data/lib/chupa-text/sax-parser.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "cgi/util"
|
18
|
+
require "rexml/parsers/sax2parser"
|
19
|
+
require "rexml/sax2listener"
|
20
|
+
|
21
|
+
begin
|
22
|
+
require "nokogiri"
|
23
|
+
rescue LoadError
|
24
|
+
end
|
25
|
+
|
26
|
+
module ChupaText
|
27
|
+
class SAXParser
|
28
|
+
class << self
|
29
|
+
def backend
|
30
|
+
case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
|
31
|
+
when "rexml"
|
32
|
+
:rexml
|
33
|
+
else
|
34
|
+
if Object.const_defined?(:Nokogiri)
|
35
|
+
:nokogiri
|
36
|
+
else
|
37
|
+
:rexml
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def initialize(input, listener)
|
44
|
+
@input = input
|
45
|
+
@listener = listener
|
46
|
+
end
|
47
|
+
|
48
|
+
if backend == :nokogiri
|
49
|
+
def parse
|
50
|
+
document = Document.new(@listener)
|
51
|
+
parser = Nokogiri::XML::SAX::Parser.new(document)
|
52
|
+
parser.parse(@input)
|
53
|
+
end
|
54
|
+
|
55
|
+
class Document < Nokogiri::XML::SAX::Document
|
56
|
+
def initialize(listener)
|
57
|
+
@listener = listener
|
58
|
+
@namespaces_stack = []
|
59
|
+
end
|
60
|
+
|
61
|
+
def start_element_namespace(name,
|
62
|
+
attributes=[],
|
63
|
+
prefix=nil,
|
64
|
+
uri=nil,
|
65
|
+
namespaces=[])
|
66
|
+
namespaces.each do |namespace_prefix, namespace_uri|
|
67
|
+
@listener.start_prefix_mapping(namespace_prefix, namespace_uri)
|
68
|
+
end
|
69
|
+
attributes_hash = {}
|
70
|
+
attributes.each do |attribute|
|
71
|
+
attribute_qname = build_qname(attribute.prefix, attribute.localname)
|
72
|
+
attributes_hash[attribute_qname] = attribute.value
|
73
|
+
end
|
74
|
+
@namespaces_stack.push(namespaces)
|
75
|
+
@listener.start_element(uri,
|
76
|
+
name,
|
77
|
+
build_qname(prefix, name),
|
78
|
+
attributes_hash)
|
79
|
+
end
|
80
|
+
|
81
|
+
def end_element_namespace(name, prefix=nil, uri=nil)
|
82
|
+
@listener.end_element(uri, name, build_qname(prefix, name))
|
83
|
+
namespaces = @namespaces_stack.pop
|
84
|
+
namespaces.each do |namespace_prefix, _|
|
85
|
+
@listener.end_prefix_mapping(namespace_prefix)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def characters(text)
|
90
|
+
@listener.characters(text)
|
91
|
+
end
|
92
|
+
|
93
|
+
def cdata_block(content)
|
94
|
+
@listener.cdata(content)
|
95
|
+
end
|
96
|
+
|
97
|
+
private
|
98
|
+
def build_qname(prefix, local_name)
|
99
|
+
if prefix
|
100
|
+
"#{prefix}:#{local_name}"
|
101
|
+
else
|
102
|
+
local_name
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
else
|
107
|
+
def parse
|
108
|
+
source = REXML::Source.new(@input.read)
|
109
|
+
parser = REXML::Parsers::SAX2Parser.new(source)
|
110
|
+
parser.listen(Listener.new(@listener))
|
111
|
+
parser.parse
|
112
|
+
end
|
113
|
+
|
114
|
+
class Listener
|
115
|
+
include REXML::SAX2Listener
|
116
|
+
|
117
|
+
def initialize(listener)
|
118
|
+
@listener = listener
|
119
|
+
end
|
120
|
+
|
121
|
+
def start_prefix_mapping(*args)
|
122
|
+
@listener.start_prefix_mapping(*args)
|
123
|
+
end
|
124
|
+
|
125
|
+
def end_prefix_mapping(*args)
|
126
|
+
@listener.end_prefix_mapping(*args)
|
127
|
+
end
|
128
|
+
|
129
|
+
def start_element(*args)
|
130
|
+
@listener.start_element(*args)
|
131
|
+
end
|
132
|
+
|
133
|
+
def end_element(*args)
|
134
|
+
@listener.end_element(*args)
|
135
|
+
end
|
136
|
+
|
137
|
+
def characters(text)
|
138
|
+
@listener.characters(CGI.unescapeHTML(text))
|
139
|
+
end
|
140
|
+
|
141
|
+
def cdata(content)
|
142
|
+
@listener.cdata(CGI.unescapeHTML(content))
|
143
|
+
end
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
class SAXListener
|
149
|
+
include REXML::SAX2Listener
|
150
|
+
end
|
151
|
+
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -15,5 +15,5 @@
|
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
17
|
module ChupaText
|
18
|
-
VERSION = "1.1.4"
|
18
|
+
VERSION = "1.1.5"
|
19
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.4
|
4
|
+
version: 1.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: archive-zip
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -53,7 +53,7 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: packnga
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ">="
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: test-unit
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description: ''
|
98
112
|
email:
|
99
113
|
- kou@clear-code.com
|
@@ -156,6 +170,7 @@ files:
|
|
156
170
|
- lib/chupa-text/logger.rb
|
157
171
|
- lib/chupa-text/mime-type-registry.rb
|
158
172
|
- lib/chupa-text/mime-type.rb
|
173
|
+
- lib/chupa-text/sax-parser.rb
|
159
174
|
- lib/chupa-text/screenshot.rb
|
160
175
|
- lib/chupa-text/size-parser.rb
|
161
176
|
- lib/chupa-text/text-data.rb
|