chupa-text 1.1.3 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +12 -0
- data/lib/chupa-text/command/chupa-text.rb +7 -1
- data/lib/chupa-text/decomposer.rb +8 -0
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
- data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
- data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
- data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
- data/lib/chupa-text/decomposers/opendocument.rb +139 -0
- data/lib/chupa-text/extractor.rb +8 -2
- data/lib/chupa-text/formatters/mime.rb +3 -2
- data/lib/chupa-text/version.rb +1 -1
- data/test/decomposers/test-office-open-xml-document.rb +144 -0
- data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
- data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
- data/test/decomposers/test-open-document-presentation.rb +136 -0
- data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
- data/test/decomposers/test-open-document-text.rb +144 -0
- data/test/fixture/docx/attributes.docx +0 -0
- data/test/fixture/docx/multi-pages.docx +0 -0
- data/test/fixture/docx/one-page.docx +0 -0
- data/test/fixture/docx/special-characters.docx +0 -0
- data/test/fixture/odp/attributes.odp +0 -0
- data/test/fixture/odp/multi-slides.odp +0 -0
- data/test/fixture/odp/one-slide.odp +0 -0
- data/test/fixture/ods/attributes.ods +0 -0
- data/test/fixture/ods/multi-sheets.ods +0 -0
- data/test/fixture/ods/one-sheet.ods +0 -0
- data/test/fixture/odt/attributes.odt +0 -0
- data/test/fixture/odt/multi-pages.odt +0 -0
- data/test/fixture/odt/one-page.odt +0 -0
- data/test/fixture/odt/special-characters.odt +0 -0
- data/test/fixture/pptx/attributes.pptx +0 -0
- data/test/fixture/pptx/multi-slides.pptx +0 -0
- data/test/fixture/pptx/one-slide.pptx +0 -0
- data/test/fixture/xlsx/attributes.xlsx +0 -0
- data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
- data/test/fixture/xlsx/one-sheet.xlsx +0 -0
- metadata +36 -2
@@ -0,0 +1,105 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "chupa-text/decomposers/opendocument"
|
18
|
+
|
19
|
+
module ChupaText
|
20
|
+
module Decomposers
|
21
|
+
class OpenDocumentPresentation < OpenDocument
|
22
|
+
registry.register("opendocument-presentation", self)
|
23
|
+
|
24
|
+
# Sets up a decomposer for OpenDocument Presentation data.
#
# +options+ is handed to the superclass initializer as is; afterwards the
# extension and MIME type that identify presentation files are recorded.
def initialize(options={})
  super(options)
  @mime_type = "application/vnd.oasis.opendocument.presentation"
  @extension = "odp"
end
|
29
|
+
|
30
|
+
private
|
31
|
+
# Parses the content entry of the archive and gathers the slides.
#
# A fresh array is stored in context[:slides]; the very same array is
# given to the SlidesListener, so it is filled while the entry is parsed.
def process_content(entry, context, &block)
  slides = []
  context[:slides] = slides
  slides_listener = SlidesListener.new(slides)
  parse(entry.file_data, slides_listener)
end
|
36
|
+
|
37
|
+
# Yields the results gathered in +context+ to the given block.
#
# First an empty-bodied TextData carrying every collected attribute is
# yielded, then one TextData per slide (in order) whose "index" attribute
# is the zero-based slide position. A missing context[:slides] is treated
# as "no slides".
def finish_decompose(context, &block)
  source = context[:data]

  metadata = TextData.new("", source_data: source)
  context[:attributes].each { |key, value| metadata[key] = value }
  yield(metadata)

  slides = context[:slides] || []
  slides.each_with_index do |slide, position|
    slide_data = TextData.new(slide[:text], source_data: source)
    slide_data["index"] = position
    yield(slide_data)
  end
end
|
51
|
+
|
52
|
+
# SAX2 listener that collects the text of every slide of an OpenDocument
# presentation.
#
# Each <draw:page> element appends a new {text: ""} hash to the array given
# to #initialize. Character data found inside <text:p> elements is appended
# to the text of the most recent slide, and a newline is appended every
# time a <text:p> ends.
class SlidesListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
  DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"

  # @param slides [Array<Hash>] array that receives one hash per slide.
  def initialize(slides)
    @slides = slides
    # Number of <text:p> elements currently open. A counter (rather than
    # a flag) keeps us "inside a paragraph" after inline children such as
    # <text:span> are closed and while paragraphs are nested.
    @p_depth = 0
  end

  def start_element(uri, local_name, qname, attributes)
    case uri
    when TEXT_URI
      @p_depth += 1 if local_name == "p"
    when DRAW_URI
      @slides << {text: ""} if local_name == "page"
    end
  end

  def end_element(uri, local_name, qname)
    # Only the end of a paragraph matters; closing any other element
    # must not stop text collection for the enclosing paragraph.
    return unless uri == TEXT_URI && local_name == "p"

    @p_depth -= 1 if @p_depth > 0
    slide = @slides.last
    # A paragraph that is not part of any slide has nowhere to go.
    slide[:text] << "\n" if slide
  end

  def characters(text)
    # NOTE(review): presumably needed because the parser hands over
    # predefined entity references (e.g. &amp;) unexpanded - confirm
    # against the REXML version in use.
    add_text(CGI.unescapeHTML(text))
  end

  def cdata(content)
    # CDATA sections are literal text, so they are never unescaped.
    add_text(content)
  end

  private
  # Appends +text+ to the current slide when inside a paragraph.
  def add_text(text)
    return unless @p_depth > 0
    slide = @slides.last
    return if slide.nil?
    slide[:text] << text
  end
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "chupa-text/decomposers/opendocument"
|
18
|
+
|
19
|
+
module ChupaText
|
20
|
+
module Decomposers
|
21
|
+
class OpenDocumentSpreadsheet < OpenDocument
|
22
|
+
registry.register("opendocument-spreadsheet", self)
|
23
|
+
|
24
|
+
# Sets up a decomposer for OpenDocument Spreadsheet data.
#
# +options+ is handed to the superclass initializer as is; afterwards the
# extension and MIME type that identify spreadsheet files are recorded.
def initialize(options={})
  super(options)
  @mime_type = "application/vnd.oasis.opendocument.spreadsheet"
  @extension = "ods"
end
|
29
|
+
|
30
|
+
private
|
31
|
+
# Parses the content entry of the archive and gathers the sheets.
#
# A fresh array is stored in context[:sheets]; the very same array is
# given to the SheetsListener, so it is filled while the entry is parsed.
def process_content(entry, context, &block)
  sheets = []
  context[:sheets] = sheets
  sheets_listener = SheetsListener.new(sheets)
  parse(entry.file_data, sheets_listener)
end
|
36
|
+
|
37
|
+
# Yields the results gathered in +context+ to the given block.
#
# First an empty-bodied TextData carrying every collected attribute is
# yielded, then one TextData per sheet (in order). Each sheet's TextData
# has an "index" attribute (zero-based position) and, when the sheet has
# a name, a "name" attribute. A missing context[:sheets] is treated as
# "no sheets".
def finish_decompose(context, &block)
  source = context[:data]

  metadata = TextData.new("", source_data: source)
  context[:attributes].each { |key, value| metadata[key] = value }
  yield(metadata)

  sheets = context[:sheets] || []
  sheets.each_with_index do |sheet, position|
    sheet_data = TextData.new(sheet[:text], source_data: source)
    sheet_data["index"] = position
    sheet_name = sheet[:name]
    sheet_data["name"] = sheet_name if sheet_name
    yield(sheet_data)
  end
end
|
53
|
+
|
54
|
+
# SAX2 listener that collects the sheets of an OpenDocument spreadsheet.
#
# Each <table:table> appends a {name:, rows: []} hash to the array given
# to #initialize, each <table:table-row> appends a row and each
# <table:table-cell> appends a {text: ""} cell to the current row.
# Character data found inside <text:p> elements is appended to the current
# cell. When a table ends, its :text is built: one line per row that has
# at least one non-empty cell, cells joined by tabs.
class SheetsListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
  TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"

  # @param sheets [Array<Hash>] array that receives one hash per sheet.
  def initialize(sheets)
    @sheets = sheets
    @prefix_to_uri = {}
    @uri_to_prefix = {}
    # Number of <text:p> elements currently open. A counter (rather than
    # a flag) keeps us "inside a paragraph" after inline children such as
    # <text:span> are closed and while paragraphs are nested.
    @p_depth = 0
  end

  # Remembers namespace declarations so that the qualified name of the
  # table "name" attribute can be built with whatever prefix is in use.
  def start_prefix_mapping(prefix, uri)
    @prefix_to_uri[prefix] = uri
    @uri_to_prefix[uri] = prefix
  end

  def end_prefix_mapping(prefix)
    uri = @prefix_to_uri.delete(prefix)
    @uri_to_prefix.delete(uri)
  end

  def start_element(uri, local_name, qname, attributes)
    case uri
    when TEXT_URI
      @p_depth += 1 if local_name == "p"
    when TABLE_URI
      case local_name
      when "table"
        table_prefix = @uri_to_prefix[TABLE_URI]
        @sheets << {
          name: attributes["#{table_prefix}:name"],
          rows: [],
        }
      when "table-row"
        @sheets.last[:rows] << []
      when "table-cell"
        @sheets.last[:rows].last << {text: ""}
      end
    end
  end

  def end_element(uri, local_name, qname)
    case uri
    when TEXT_URI
      # Only the end of a paragraph leaves it; closing any other element
      # must not stop text collection for the enclosing paragraph.
      @p_depth -= 1 if local_name == "p" && @p_depth > 0
    when TABLE_URI
      build_sheet_text(@sheets.last) if local_name == "table"
    end
  end

  def characters(text)
    # NOTE(review): presumably needed because the parser hands over
    # predefined entity references (e.g. &amp;) unexpanded - confirm
    # against the REXML version in use.
    add_text(CGI.unescapeHTML(text))
  end

  def cdata(content)
    # CDATA sections are literal text, so they are never unescaped.
    add_text(content)
  end

  private
  # Appends +text+ to the current cell when inside a paragraph.
  def add_text(text)
    return unless @p_depth > 0
    cell = current_cell
    # Paragraphs can occur where no cell exists yet (for example in a
    # shape placed before the first row); such text is skipped.
    return if cell.nil?
    cell[:text] << text
  end

  # Returns the most recently started cell, or nil when there is none.
  def current_cell
    sheet = @sheets.last
    return nil if sheet.nil?
    row = sheet[:rows].last
    return nil if row.nil?
    row.last
  end

  # Stores the tab/newline separated text of +sheet+ in sheet[:text],
  # leaving out rows whose cells are all empty.
  def build_sheet_text(sheet)
    text = ""
    sheet[:rows].each do |row|
      cell_texts = row.collect {|cell| cell[:text]}
      next if cell_texts.all?(&:empty?)
      text << cell_texts.join("\t") << "\n"
    end
    sheet[:text] = text
  end
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "chupa-text/decomposers/opendocument"
|
18
|
+
|
19
|
+
module ChupaText
|
20
|
+
module Decomposers
|
21
|
+
class OpenDocumentText < OpenDocument
|
22
|
+
registry.register("opendocument-text", self)
|
23
|
+
|
24
|
+
# Sets up a decomposer for OpenDocument Text data.
#
# +options+ is handed to the superclass initializer as is; afterwards the
# extension and MIME type that identify text documents are recorded.
def initialize(options={})
  super(options)
  @mime_type = "application/vnd.oasis.opendocument.text"
  @extension = "odt"
end
|
29
|
+
|
30
|
+
private
|
31
|
+
# Parses the content entry of the archive and gathers the document text.
#
# A fresh string is stored in context[:text]; the very same string is
# given to the TextListener, so it grows while the entry is parsed.
def process_content(entry, context, &block)
  output = ""
  context[:text] = output
  text_listener = TextListener.new(output)
  parse(entry.file_data, text_listener)
end
|
36
|
+
|
37
|
+
# Yields a single TextData holding the collected document text (an empty
# string when no text was collected) together with every collected
# attribute.
def finish_decompose(context, &block)
  body = context[:text] || ""
  text_data = TextData.new(body, source_data: context[:data])
  context[:attributes].each { |key, value| text_data[key] = value }
  yield(text_data)
end
|
45
|
+
|
46
|
+
# SAX2 listener that collects the paragraph text of an OpenDocument text
# document.
#
# Character data found inside <text:p> elements is appended to the string
# given to #initialize, and a newline is appended every time a <text:p>
# ends.
class TextListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"

  # @param output [String] string that the extracted text is appended to.
  def initialize(output)
    @output = output
    # Number of <text:p> elements currently open. A counter (rather than
    # a flag) keeps us "inside a paragraph" after inline children such as
    # <text:span> are closed and while paragraphs are nested.
    @p_depth = 0
  end

  def start_element(uri, local_name, qname, attributes)
    return unless uri == TEXT_URI
    @p_depth += 1 if local_name == "p"
  end

  def end_element(uri, local_name, qname)
    # Only the end of a paragraph matters; closing any other element
    # must not stop text collection for the enclosing paragraph.
    return unless uri == TEXT_URI && local_name == "p"

    @p_depth -= 1 if @p_depth > 0
    @output << "\n"
  end

  def characters(text)
    # NOTE(review): presumably needed because the parser hands over
    # predefined entity references (e.g. &amp;) unexpanded - confirm
    # against the REXML version in use.
    add_text(CGI.unescapeHTML(text))
  end

  def cdata(content)
    # CDATA sections are literal text, so they are never unescaped.
    add_text(content)
  end

  private
  # Appends +text+ to the output when inside a paragraph.
  def add_text(text)
    return unless @p_depth > 0
    @output << text
  end
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "cgi/util"
require "rexml/parsers/sax2parser"
require "rexml/sax2listener"
require "time"

require "archive/zip"
|
22
|
+
|
23
|
+
module ChupaText
|
24
|
+
module Decomposers
|
25
|
+
class OpenDocument < Decomposer
|
26
|
+
# Tells whether +data+ should be handled by this decomposer: either its
# extension or its MIME type has to equal the one configured on the
# instance.
def target?(data)
  (data.extension == @extension) || (data.mime_type == @mime_type)
end
|
30
|
+
|
31
|
+
# Returns -1 when +data+ is a target of this decomposer and nil otherwise.
def target_score(data)
  -1 if target?(data)
end
|
38
|
+
|
39
|
+
# Walks through the zip archive behind +data+ and hands its parts to the
# format specific hooks.
#
# The "content.xml" entry goes to #process_content and the "meta.xml"
# entry to #process_meta; entries that are not files, and all other
# entries, are skipped. Both hooks share one context hash (:data and
# :attributes are preset). Once the archive has been read,
# #finish_decompose is called with that context and the caller's block.
def decompose(data, &block)
  context = {data: data, attributes: {}}

  data.open do |input|
    Archive::Zip.open(input) do |archive|
      archive.each do |entry|
        next unless entry.file?

        path = entry.zip_path
        if path == "content.xml"
          process_content(entry, context, &block)
        elsif path == "meta.xml"
          process_meta(entry, context, &block)
        end
      end
    end
  end

  finish_decompose(context, &block)
end
|
59
|
+
|
60
|
+
private
|
61
|
+
# Reads +io+ completely and runs it through a REXML SAX2 parser that
# reports every event to +listener+.
def parse(io, listener)
  sax_parser = REXML::Parsers::SAX2Parser.new(REXML::Source.new(io.read))
  sax_parser.listen(listener)
  sax_parser.parse
end
|
67
|
+
|
68
|
+
# Parses the metadata entry of the archive; the AttributesListener writes
# what it finds straight into context[:attributes].
def process_meta(entry, context, &block)
  attributes_listener = AttributesListener.new(context[:attributes])
  meta_io = entry.file_data
  parse(meta_io, attributes_listener)
end
|
72
|
+
|
73
|
+
# SAX2 listener that copies selected elements of an OpenDocument metadata
# stream into the hash given to #initialize.
#
# Recognized elements and the keys they are stored under:
#
#   meta:creation-date                   -> "created_time"  (Time)
#   meta:keyword                         -> "keywords"      (Array)
#   meta:generator                       -> "generator"
#   dc:date                              -> "modified_time" (Time)
#   dc:description, dc:title, dc:subject -> the element's local name
class AttributesListener
  include REXML::SAX2Listener

  META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
  DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"

  # @param attributes [Hash] hash that receives the extracted values.
  def initialize(attributes)
    @attributes = attributes
    @name = nil
    @type = nil
  end

  # Selects the key (and value type) under which the text of the element
  # being opened will be stored.
  def start_element(uri, local_name, qname, attributes)
    case uri
    when META_URI
      case local_name
      when "creation-date"
        @name = "created_time"
        @type = :w3cdtf
      when "keyword"
        @name = "keywords"
        @type = :array
      when "generator"
        @name = local_name
      end
    when DUBLIN_CORE_URI
      case local_name
      when "date"
        @name = "modified_time"
        @type = :w3cdtf
      when "description", "title", "subject"
        @name = local_name
      end
    end
  end

  def end_element(uri, local_name, qname)
    @name = nil
    @type = nil
  end

  def characters(text)
    set_attribute(text)
  end

  def cdata(content)
    # CDATA sections are literal text, so they are stored without being
    # unescaped.
    store_attribute(content)
  end

  # Stores +value+ (HTML-unescaped first) under the key selected by the
  # element that is currently open; does nothing outside such an element.
  def set_attribute(value)
    return if @name.nil?

    # NOTE(review): presumably needed because the parser hands over
    # predefined entity references (e.g. &amp;) unexpanded - confirm
    # against the REXML version in use.
    store_attribute(CGI.unescapeHTML(value))
  end

  private
  # Converts +value+ according to the current type and stores it.
  def store_attribute(value)
    return if @name.nil?

    case @type
    when :w3cdtf
      begin
        value = Time.xmlschema(value)
      rescue ArgumentError
        # A malformed timestamp only costs us this one attribute; it must
        # not abort the extraction of the whole document.
        return
      end
    when :array
      values = @attributes[@name] || []
      values << value
      value = values
    end
    @attributes[@name] = value
  end
end
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|