chupa-text 1.1.3 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/doc/text/news.md +12 -0
  3. data/lib/chupa-text/command/chupa-text.rb +7 -1
  4. data/lib/chupa-text/decomposer.rb +8 -0
  5. data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
  6. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
  8. data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
  9. data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
  11. data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
  12. data/lib/chupa-text/decomposers/opendocument.rb +139 -0
  13. data/lib/chupa-text/extractor.rb +8 -2
  14. data/lib/chupa-text/formatters/mime.rb +3 -2
  15. data/lib/chupa-text/version.rb +1 -1
  16. data/test/decomposers/test-office-open-xml-document.rb +144 -0
  17. data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
  18. data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
  19. data/test/decomposers/test-open-document-presentation.rb +136 -0
  20. data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
  21. data/test/decomposers/test-open-document-text.rb +144 -0
  22. data/test/fixture/docx/attributes.docx +0 -0
  23. data/test/fixture/docx/multi-pages.docx +0 -0
  24. data/test/fixture/docx/one-page.docx +0 -0
  25. data/test/fixture/docx/special-characters.docx +0 -0
  26. data/test/fixture/odp/attributes.odp +0 -0
  27. data/test/fixture/odp/multi-slides.odp +0 -0
  28. data/test/fixture/odp/one-slide.odp +0 -0
  29. data/test/fixture/ods/attributes.ods +0 -0
  30. data/test/fixture/ods/multi-sheets.ods +0 -0
  31. data/test/fixture/ods/one-sheet.ods +0 -0
  32. data/test/fixture/odt/attributes.odt +0 -0
  33. data/test/fixture/odt/multi-pages.odt +0 -0
  34. data/test/fixture/odt/one-page.odt +0 -0
  35. data/test/fixture/odt/special-characters.odt +0 -0
  36. data/test/fixture/pptx/attributes.pptx +0 -0
  37. data/test/fixture/pptx/multi-slides.pptx +0 -0
  38. data/test/fixture/pptx/one-slide.pptx +0 -0
  39. data/test/fixture/xlsx/attributes.xlsx +0 -0
  40. data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
  41. data/test/fixture/xlsx/one-sheet.xlsx +0 -0
  42. metadata +36 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 38219bcd127b3e4b7ee11a46d51e7e6762b2e5ad637239518c9ec64cdb4906fc
4
- data.tar.gz: 279ac9fa242ec20c8c7d7710d5a26bb58bd73cbe0d3a6473e4b714d766743825
3
+ metadata.gz: ce1def40525d7278aa45cdbe3af69cd95656fa58d1d02e6ddb3e2677940ed7d6
4
+ data.tar.gz: 4ec184c4bd0f61508d4b1908e7c89abd8aeb01026c8b3590b404ca672887c6a1
5
5
  SHA512:
6
- metadata.gz: 776f7004429ff9a95bdbac88a50029cfc1cebdbf783ded8f91ddd6e29f79fc5fc8d113a616bfdfb5684b940a5cbb9ef47d4a9bff85cc6a9f69e1300997293207
7
- data.tar.gz: b0c5299e11c13377a68f3871bb1a673ec3216577f04430f3cb30352999ac3f8a5b7d9d9d2dabe5a30da053904662610b193d7364f42f414acd72c3d2b18df126
6
+ metadata.gz: ae43e4354761a953f61cda5348524f44346996133e4f77a310bfbfd07295b4548d41847f8132387ec9ade30c44a9c2bdd7936c7633085534ce03bb5db6f9061f
7
+ data.tar.gz: f9174e01e21a2dbbc1647d191084969e9fd0cf4dc6f24a469ac7c6c4f2378d7a11946e6b2350e8df86ad5d007c6f048c804d4c1c2b79e95628511383c76c6061
@@ -1,5 +1,17 @@
1
1
  # News
2
2
 
3
+ ## 1.1.4: 2019-02-26
4
+
5
+ ### Improvements
6
+
7
+ * Added support for decomposer selection by score.
8
+
9
+ * Added support for Office Open XML.
10
+
11
+ * Added support for OpenDocument.
12
+
13
+ * `chupa-text`: Added `--mime-boundary` option.
14
+
3
15
  ## 1.1.3: 2018-07-18
4
16
 
5
17
  ### Improvements
@@ -46,6 +46,7 @@ module ChupaText
46
46
  @uri = nil
47
47
  @mime_type = nil
48
48
  @format = :json
49
+ @mime_formatter_options = {}
49
50
  @need_screenshot = true
50
51
  @expected_screenshot_size = [200, 200]
51
52
  end
@@ -127,6 +128,11 @@ module ChupaText
127
128
  "(default: #{@format})") do |format|
128
129
  @format = format
129
130
  end
131
+ parser.on("--mime-boundary=BOUNDARY",
132
+ "Use BOUNDARY for MIME boundary.",
133
+ "(default: Use SHA1 digest of URI)") do |boundary|
134
+ @mime_formatter_options[:boundary] = boundary
135
+ end
130
136
  parser.on("--[no-]need-screenshot",
131
137
  "Generate screenshot if available.",
132
138
  "(default: #{@need_screenshot})") do |boolean|
@@ -220,7 +226,7 @@ module ChupaText
220
226
  when :text
221
227
  Formatters::Text.new($stdout)
222
228
  when :mime
223
- Formatters::MIME.new($stdout)
229
+ Formatters::MIME.new($stdout, @mime_formatter_options)
224
230
  end
225
231
  end
226
232
  end
@@ -30,6 +30,14 @@ module ChupaText
30
30
  raise NotImplementedError, "must implement #{self.class}\##{__method__}"
31
31
  end
32
32
 
33
# Scores how well this decomposer fits the given data.
#
# @param data the data object that is handed to #target?
# @return [Integer, nil] 0 when #target? accepts the data, nil when
#   it does not.
def target_score(data)
  return nil unless target?(data)

  0
end
40
+
33
41
  def decompose(data)
34
42
  raise NotImplementedError, "must implement #{self.class}\##{__method__}"
35
43
  end
@@ -0,0 +1,51 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/office-open-xml"
18
+
19
module ChupaText
  module Decomposers
    # Decomposer for Office Open XML word processing files.
    #
    # It is registered under the name "office-open-xml-document" and
    # only supplies configuration (extensions, MIME types and the XML
    # namespace) plus the per-entry hook; everything else is inherited
    # from OfficeOpenXML.
    class OfficeOpenXMLDocument < OfficeOpenXML
      registry.register("office-open-xml-document", self)

      # @param options [Hash] passed through unchanged to the
      #   superclass initializer.
      def initialize(options={})
        super
        # Namespace handed to the text listener by #extract_text.
        @namespace_uri =
          "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        # File extensions accepted by #target?.
        @extensions = ["docx", "docm", "dotx", "dotm"]
        # MIME types accepted by #target?.
        @mime_types = [
          "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
          "application/vnd.ms-word.document.macroEnabled.12",
          "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
          "application/vnd.ms-word.template.macroEnabled.12",
        ]
      end

      private
      # Extracts the body text when the entry is the main document
      # part; every other entry is ignored.
      #
      # @param entry the archive entry being visited
      # @param context [Hash] extraction state; the text is appended
      #   to context[:text]
      def process_entry(entry, context)
        return unless entry.zip_path == "word/document.xml"

        extract_text(entry, context[:text])
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/office-open-xml"
18
+
19
module ChupaText
  module Decomposers
    # Decomposer for Office Open XML presentation files.
    #
    # It is registered under the name "office-open-xml-presentation".
    # The text of every slide part is collected separately and joined
    # in slide-number order once all archive entries were visited.
    class OfficeOpenXMLPresentation < OfficeOpenXML
      registry.register("office-open-xml-presentation", self)

      # @param options [Hash] passed through unchanged to the
      #   superclass initializer.
      def initialize(options={})
        super
        # File extensions accepted by #target?.
        @extensions = [
          "pptx",
          "pptm",
          "ppsx",
          "ppsm",
          "potx",
          "potm",
          "sldx",
          "sldm",
        ]
        # MIME types accepted by #target?.
        @mime_types = [
          "application/vnd.openxmlformats-officedocument.presentationml.presentation",
          "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
          "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
          "application/vnd.ms-powerpoint.slideshow.macroEnabled.12",
          "application/vnd.openxmlformats-officedocument.presentationml.template",
          "application/vnd.ms-powerpoint.template.macroEnabled.12",
          "application/vnd.openxmlformats-officedocument.presentationml.slide",
          "application/vnd.ms-powerpoint.slide.macroEnabled.12",
        ]
        # Namespace handed to the text listener by #extract_text.
        @namespace_uri =
          "http://schemas.openxmlformats.org/drawingml/2006/main"
      end

      private
      # Collects the text of one slide part.
      #
      # The pattern is anchored at both ends so that only
      # "ppt/slides/slideN.xml" itself matches, not a longer path that
      # merely starts with it.
      #
      # @param entry the archive entry being visited
      # @param context [Hash] extraction state; [slide number, text]
      #   pairs are appended to context[:slides]
      def process_entry(entry, context)
        case entry.zip_path
        when /\Appt\/slides\/slide(\d+)\.xml\z/
          nth_slide = Integer($1, 10)
          slide_text = ""
          extract_text(entry, slide_text)
          context[:slides] ||= []
          context[:slides] << [nth_slide, slide_text]
        end
      end

      # Joins the collected slide texts in ascending slide number.
      #
      # @param context [Hash] extraction state filled by #process_entry
      # @return [String] the slide texts separated by "\n"; an empty
      #   string when no slide part was found.
      def accumulate_text(context)
        # context[:slides] is only created when a slide part is seen,
        # so fall back to an empty list for packages without slides.
        slides = context[:slides] || []
        slides.sort_by(&:first).collect(&:last).join("\n")
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/office-open-xml"
18
+
19
module ChupaText
  module Decomposers
    # Decomposer for Office Open XML workbook (spreadsheet) files.
    #
    # It is registered under the name "office-open-xml-workbook". The
    # shared string table and the cells of every worksheet part are
    # collected while the archive is visited; #accumulate_text then
    # renders each sheet as tab separated rows.
    class OfficeOpenXMLWorkbook < OfficeOpenXML
      registry.register("office-open-xml-workbook", self)

      # @param options [Hash] passed through unchanged to the
      #   superclass initializer.
      def initialize(options={})
        super
        # File extensions accepted by #target?.
        @extensions = [
          "xlsx",
          "xlsm",
          "xltx",
          "xltm",
        ]
        # MIME types accepted by #target?.
        @mime_types = [
          "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          "application/vnd.ms-excel.sheet.macroEnabled.12",
          "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
          "application/vnd.ms-excel.template.macroEnabled.12",
        ]
        # Namespace handed to the text listener by #extract_text.
        @namespace_uri =
          "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
      end

      private
      # Collects the shared string table and the worksheet cells.
      #
      # @param entry the archive entry being visited
      # @param context [Hash] extraction state; fills
      #   context[:shared_strings] and context[:sheets]
      def process_entry(entry, context)
        case entry.zip_path
        when "xl/sharedStrings.xml"
          # NOTE(review): the text listener appends one array element
          # per text run, so index lookups assume one run per shared
          # string - confirm for rich text entries.
          context[:shared_strings] = []
          extract_text(entry, context[:shared_strings])
        when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
          nth_sheet = Integer($1, 10)
          sheet = []
          listener = SheetListener.new(sheet)
          parse(entry.file_data, listener)
          context[:sheets] ||= []
          context[:sheets] << [nth_sheet, sheet]
        end
      end

      # Renders all sheets in ascending sheet number.
      #
      # Each row becomes one line of tab separated cells and sheets
      # are separated by "\n".
      #
      # @param context [Hash] extraction state filled by #process_entry
      # @return [String] the rendered text; an empty string when the
      #   package has no worksheet part.
      def accumulate_text(context)
        # Both keys are only set when the matching part exists in the
        # package, so default them instead of failing on nil.
        shared_strings = context[:shared_strings] || []
        sheets = (context[:sheets] || []).sort_by(&:first).collect(&:last)
        sheet_texts = sheets.collect do |sheet|
          sheet_text = ""
          sheet.each do |row|
            row_texts = row.collect do |cell|
              # Integer cells are shared string indexes, String cells
              # already hold the literal cell value.
              cell.is_a?(Integer) ? shared_strings[cell] : cell
            end
            sheet_text << row_texts.join("\t") << "\n"
          end
          sheet_text
        end
        sheet_texts.join("\n")
      end

      # SAX listener that records the cell values of one worksheet.
      #
      # The sheet is an Array of rows and every row is an Array of
      # cells. A cell whose type attribute is "s" is stored as the
      # Integer index into the shared string table; any other cell is
      # stored as the literal text of its <v> element.
      class SheetListener
        include REXML::SAX2Listener

        URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"

        # @param sheet [Array] receives one Array per <row> element
        def initialize(sheet)
          @sheet = sheet
          @cell_type = nil
          @in_v = false
        end

        def start_element(uri, local_name, qname, attributes)
          return unless uri == URI
          case local_name
          when "row"
            @sheet << []
          when "c"
            # "t" tells how the following <v> value must be read.
            @cell_type = attributes["t"]
          when "v"
            @in_v = true
          end
        end

        def end_element(uri, local_name, qname)
          @in_v = false
          @cell_type = nil if uri == URI and local_name == "c"
        end

        def characters(text)
          add_column(text)
        end

        def cdata(content)
          add_column(content)
        end

        private
        # Appends the current <v> content to the row being read.
        #
        # @raise [ArgumentError] when a shared string cell does not
        #   hold a base 10 integer.
        def add_column(text)
          return unless @in_v
          if @cell_type == "s"
            @sheet.last << Integer(text, 10)
          else
            # Literal value (number, boolean, formula string, ...).
            # Unescaped like the text listener of the base class does;
            # CGI is loaded by the required office-open-xml file.
            @sheet.last << CGI.unescapeHTML(text)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,196 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "cgi/util"
18
+ require "rexml/parsers/sax2parser"
19
+ require "rexml/sax2listener"
20
+
21
+ require "archive/zip"
22
+
23
module ChupaText
  module Decomposers
    # Shared implementation of the Office Open XML decomposers.
    #
    # This class reads @extensions, @mime_types and @namespace_uri and
    # calls #process_entry, none of which it defines itself; concrete
    # subclasses have to provide them.
    class OfficeOpenXML < Decomposer
      # @param data the data to check; its extension and MIME type are
      #   compared with @extensions and @mime_types
      # @return [Boolean] whether either of them is listed.
      def target?(data)
        @extensions.include?(data.extension) ||
          @mime_types.include?(data.mime_type)
      end

      # @param data the data to check
      # @return [Integer, nil] -1 when #target? accepts the data, nil
      #   when it does not.
      def target_score(data)
        target?(data) ? -1 : nil
      end

      # Walks all file entries of the ZIP package, collects document
      # properties and text, and yields a single TextData.
      #
      # @param data the source data; it is opened with data.open and
      #   passed on as source_data of the yielded object
      # @yieldparam text_data [TextData] the extracted text with the
      #   collected attributes assigned to it
      def decompose(data)
        context = {text: "", attributes: {}}
        data.open do |input|
          Archive::Zip.open(input) do |zip|
            zip.each do |entry|
              next unless entry.file?
              case entry.zip_path
              when "docProps/app.xml", "docProps/core.xml"
                # Both property parts are read by the same listener.
                properties = AttributesListener.new(context[:attributes])
                parse(entry.file_data, properties)
              else
                process_entry(entry, context)
              end
            end
          end
        end
        text_data = TextData.new(accumulate_text(context), source_data: data)
        context[:attributes].each do |name, value|
          text_data[name] = value
        end
        yield(text_data)
      end

      private
      # Reads the whole IO and runs a SAX parse that reports to the
      # given listener.
      def parse(io, listener)
        sax_parser = REXML::Parsers::SAX2Parser.new(REXML::Source.new(io.read))
        sax_parser.listen(listener)
        sax_parser.parse
      end

      # Appends the text found in the entry to texts, using a
      # TextListener bound to @namespace_uri.
      def extract_text(entry, texts)
        parse(entry.file_data, TextListener.new(texts, @namespace_uri))
      end

      # Default text assembly: the text accumulated in context[:text].
      def accumulate_text(context)
        context[:text]
      end

      # SAX listener that appends the character data of "t" elements
      # in the target namespace to the output, and a newline whenever
      # a "p" or "br" element of that namespace ends.
      class TextListener
        include REXML::SAX2Listener

        # @param output object that receives the text through <<
        # @param target_uri [String] namespace URI of the elements to
        #   look at
        def initialize(output, target_uri)
          @output = output
          @target_uri = target_uri
          @in_target = false
        end

        def start_element(uri, local_name, qname, attributes)
          @in_target = true if uri == @target_uri && local_name == "t"
        end

        def end_element(uri, local_name, qname)
          # Any closing tag ends the current text run.
          @in_target = false

          return unless uri == @target_uri
          @output << "\n" if ["p", "br"].include?(local_name)
        end

        def characters(text)
          add_text(text)
        end

        def cdata(content)
          add_text(content)
        end

        private
        # Appends the HTML-unescaped text while inside a "t" element.
        def add_text(text)
          @output << CGI.unescapeHTML(text) if @in_target
        end
      end

      # SAX listener that stores selected document properties in the
      # given attributes hash.
      class AttributesListener
        include REXML::SAX2Listener

        CORE_PROPERTIES_URI =
          "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
        EXTENDED_PROPERTIES_URI =
          "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
        DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
        DUBLIN_CORE_TERMS_URI = "http://purl.org/dc/terms/"

        # @param attributes [Hash] receives the property values keyed
        #   by attribute name
        def initialize(attributes)
          @attributes = attributes
          @name = nil
          @type = nil
        end

        # Decides under which name (and with which value type) the
        # content of the element that just started is stored.
        def start_element(uri, local_name, qname, attributes)
          case uri
          when CORE_PROPERTIES_URI
            @name = local_name if local_name == "keywords"
          when EXTENDED_PROPERTIES_URI
            @name = local_name.downcase if local_name == "Application"
          when DUBLIN_CORE_URI
            if ["description", "title", "subject"].include?(local_name)
              @name = local_name
            end
          when DUBLIN_CORE_TERMS_URI
            if ["created", "modified"].include?(local_name)
              @name = "#{local_name}_time"
              @type = :w3cdtf
            end
          end
        end

        def end_element(uri, local_name, qname)
          @name = nil
          @type = nil
        end

        def characters(text)
          set_attribute(text)
        end

        def cdata(content)
          set_attribute(content)
        end

        # Stores the value under the name chosen by #start_element;
        # does nothing when the current element is not of interest.
        def set_attribute(value)
          return if @name.nil?

          unescaped = CGI.unescapeHTML(value)
          # NOTE(review): Time.xmlschema comes from the "time" library,
          # which this file does not require - presumably loaded
          # elsewhere, confirm.
          unescaped = Time.xmlschema(unescaped) if @type == :w3cdtf
          @attributes[@name] = unescaped
        end
      end
    end
  end
end