chupa-text 1.1.3 → 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/doc/text/news.md +12 -0
  3. data/lib/chupa-text/command/chupa-text.rb +7 -1
  4. data/lib/chupa-text/decomposer.rb +8 -0
  5. data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
  6. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
  8. data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
  9. data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
  11. data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
  12. data/lib/chupa-text/decomposers/opendocument.rb +139 -0
  13. data/lib/chupa-text/extractor.rb +8 -2
  14. data/lib/chupa-text/formatters/mime.rb +3 -2
  15. data/lib/chupa-text/version.rb +1 -1
  16. data/test/decomposers/test-office-open-xml-document.rb +144 -0
  17. data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
  18. data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
  19. data/test/decomposers/test-open-document-presentation.rb +136 -0
  20. data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
  21. data/test/decomposers/test-open-document-text.rb +144 -0
  22. data/test/fixture/docx/attributes.docx +0 -0
  23. data/test/fixture/docx/multi-pages.docx +0 -0
  24. data/test/fixture/docx/one-page.docx +0 -0
  25. data/test/fixture/docx/special-characters.docx +0 -0
  26. data/test/fixture/odp/attributes.odp +0 -0
  27. data/test/fixture/odp/multi-slides.odp +0 -0
  28. data/test/fixture/odp/one-slide.odp +0 -0
  29. data/test/fixture/ods/attributes.ods +0 -0
  30. data/test/fixture/ods/multi-sheets.ods +0 -0
  31. data/test/fixture/ods/one-sheet.ods +0 -0
  32. data/test/fixture/odt/attributes.odt +0 -0
  33. data/test/fixture/odt/multi-pages.odt +0 -0
  34. data/test/fixture/odt/one-page.odt +0 -0
  35. data/test/fixture/odt/special-characters.odt +0 -0
  36. data/test/fixture/pptx/attributes.pptx +0 -0
  37. data/test/fixture/pptx/multi-slides.pptx +0 -0
  38. data/test/fixture/pptx/one-slide.pptx +0 -0
  39. data/test/fixture/xlsx/attributes.xlsx +0 -0
  40. data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
  41. data/test/fixture/xlsx/one-sheet.xlsx +0 -0
  42. metadata +36 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 38219bcd127b3e4b7ee11a46d51e7e6762b2e5ad637239518c9ec64cdb4906fc
4
- data.tar.gz: 279ac9fa242ec20c8c7d7710d5a26bb58bd73cbe0d3a6473e4b714d766743825
3
+ metadata.gz: ce1def40525d7278aa45cdbe3af69cd95656fa58d1d02e6ddb3e2677940ed7d6
4
+ data.tar.gz: 4ec184c4bd0f61508d4b1908e7c89abd8aeb01026c8b3590b404ca672887c6a1
5
5
  SHA512:
6
- metadata.gz: 776f7004429ff9a95bdbac88a50029cfc1cebdbf783ded8f91ddd6e29f79fc5fc8d113a616bfdfb5684b940a5cbb9ef47d4a9bff85cc6a9f69e1300997293207
7
- data.tar.gz: b0c5299e11c13377a68f3871bb1a673ec3216577f04430f3cb30352999ac3f8a5b7d9d9d2dabe5a30da053904662610b193d7364f42f414acd72c3d2b18df126
6
+ metadata.gz: ae43e4354761a953f61cda5348524f44346996133e4f77a310bfbfd07295b4548d41847f8132387ec9ade30c44a9c2bdd7936c7633085534ce03bb5db6f9061f
7
+ data.tar.gz: f9174e01e21a2dbbc1647d191084969e9fd0cf4dc6f24a469ac7c6c4f2378d7a11946e6b2350e8df86ad5d007c6f048c804d4c1c2b79e95628511383c76c6061
data/doc/text/news.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # News
2
2
 
3
+ ## 1.1.4: 2019-02-26
4
+
5
+ ### Improvements
6
+
7
+ * Added support for decomposer selection by score.
8
+
9
+ * Added support for Office Open XML.
10
+
11
+ * Added support for OpenDocument.
12
+
13
+ * `chupa-text`: Added `--mime-boundary` option.
14
+
3
15
  ## 1.1.3: 2018-07-18
4
16
 
5
17
  ### Improvements
data/lib/chupa-text/command/chupa-text.rb CHANGED
@@ -46,6 +46,7 @@ module ChupaText
46
46
  @uri = nil
47
47
  @mime_type = nil
48
48
  @format = :json
49
+ @mime_formatter_options = {}
49
50
  @need_screenshot = true
50
51
  @expected_screenshot_size = [200, 200]
51
52
  end
@@ -127,6 +128,11 @@ module ChupaText
127
128
  "(default: #{@format})") do |format|
128
129
  @format = format
129
130
  end
131
+ parser.on("--mime-boundary=BOUNDARY",
132
+ "Use BOUNDARY for MIME boundary.",
133
+ "(default: Use SHA1 digest of URI)") do |boundary|
134
+ @mime_formatter_options[:boundary] = boundary
135
+ end
130
136
  parser.on("--[no-]need-screenshot",
131
137
  "Generate screenshot if available.",
132
138
  "(default: #{@need_screenshot})") do |boolean|
@@ -220,7 +226,7 @@ module ChupaText
220
226
  when :text
221
227
  Formatters::Text.new($stdout)
222
228
  when :mime
223
- Formatters::MIME.new($stdout)
229
+ Formatters::MIME.new($stdout, @mime_formatter_options)
224
230
  end
225
231
  end
226
232
  end
data/lib/chupa-text/decomposer.rb CHANGED
@@ -30,6 +30,14 @@ module ChupaText
30
30
  raise NotImplementedError, "must implement #{self.class}\##{__method__}"
31
31
  end
32
32
 
33
+ def target_score(data)
34
+ if target?(data)
35
+ 0
36
+ else
37
+ nil
38
+ end
39
+ end
40
+
33
41
  def decompose(data)
34
42
  raise NotImplementedError, "must implement #{self.class}\##{__method__}"
35
43
  end
data/lib/chupa-text/decomposers/office-open-xml-document.rb ADDED
@@ -0,0 +1,51 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/office-open-xml"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OfficeOpenXMLDocument < OfficeOpenXML
22
+ registry.register("office-open-xml-document", self)
23
+
24
+ def initialize(options={})
25
+ super
26
+ @extensions = [
27
+ "docx",
28
+ "docm",
29
+ "dotx",
30
+ "dotm",
31
+ ]
32
+ @mime_types = [
33
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
34
+ "application/vnd.ms-word.document.macroEnabled.12",
35
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
36
+ "application/vnd.ms-word.template.macroEnabled.12",
37
+ ]
38
+ @namespace_uri =
39
+ "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
40
+ end
41
+
42
+ private
43
+ def process_entry(entry, context)
44
+ case entry.zip_path
45
+ when "word/document.xml"
46
+ extract_text(entry, context[:text])
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
data/lib/chupa-text/decomposers/office-open-xml-presentation.rb ADDED
@@ -0,0 +1,67 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/office-open-xml"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OfficeOpenXMLPresentation < OfficeOpenXML
22
+ registry.register("office-open-xml-presentation", self)
23
+
24
+ def initialize(options={})
25
+ super
26
+ @extensions = [
27
+ "pptx",
28
+ "pptm",
29
+ "ppsx",
30
+ "ppsm",
31
+ "potx",
32
+ "potm",
33
+ "sldx",
34
+ "sldm",
35
+ ]
36
+ @mime_types = [
37
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
38
+ "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
39
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
40
+ "application/vnd.ms-powerpoint.slideshow.macroEnabled.12",
41
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
42
+ "application/vnd.ms-powerpoint.template.macroEnabled.12",
43
+ "application/vnd.openxmlformats-officedocument.presentationml.slide",
44
+ "application/vnd.ms-powerpoint.slide.macroEnabled.12",
45
+ ]
46
+ @namespace_uri =
47
+ "http://schemas.openxmlformats.org/drawingml/2006/main"
48
+ end
49
+
50
+ private
51
+ def process_entry(entry, context)
52
+ case entry.zip_path
53
+ when /\Appt\/slides\/slide(\d+)\.xml/
54
+ nth_slide = Integer($1, 10)
55
+ slide_text = ""
56
+ extract_text(entry, slide_text)
57
+ context[:slides] ||= []
58
+ context[:slides] << [nth_slide, slide_text]
59
+ end
60
+ end
61
+
62
+ def accumulate_text(context)
63
+ context[:slides].sort_by(&:first).collect(&:last).join("\n")
64
+ end
65
+ end
66
+ end
67
+ end
data/lib/chupa-text/decomposers/office-open-xml-workbook.rb ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/office-open-xml"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OfficeOpenXMLWorkbook < OfficeOpenXML
22
+ registry.register("office-open-xml-workbook", self)
23
+
24
+ def initialize(options={})
25
+ super
26
+ @extensions = [
27
+ "xlsx",
28
+ "xlsm",
29
+ "xltx",
30
+ "xltm",
31
+ ]
32
+ @mime_types = [
33
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
34
+ "application/vnd.ms-excel.sheet.macroEnabled.12",
35
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
36
+ "application/vnd.ms-excel.template.macroEnabled.12",
37
+ ]
38
+ @namespace_uri =
39
+ "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
40
+ end
41
+
42
+ private
43
+ def process_entry(entry, context)
44
+ case entry.zip_path
45
+ when "xl/sharedStrings.xml"
46
+ context[:shared_strings] = []
47
+ extract_text(entry, context[:shared_strings])
48
+ when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
49
+ nth_sheet = Integer($1, 10)
50
+ sheet = []
51
+ listener = SheetListener.new(sheet)
52
+ parse(entry.file_data, listener)
53
+ context[:sheets] ||= []
54
+ context[:sheets] << [nth_sheet, sheet]
55
+ end
56
+ end
57
+
58
+ def accumulate_text(context)
59
+ shared_strings = context[:shared_strings]
60
+ sheets = context[:sheets].sort_by(&:first).collect(&:last)
61
+ sheet_texts = sheets.collect do |sheet|
62
+ sheet_text = ""
63
+ sheet.each do |row|
64
+ row_texts = row.collect do |index|
65
+ shared_strings[index]
66
+ end
67
+ sheet_text << row_texts.join("\t") << "\n"
68
+ end
69
+ sheet_text
70
+ end
71
+ sheet_texts.join("\n")
72
+ end
73
+
74
+ class SheetListener
75
+ include REXML::SAX2Listener
76
+
77
+ URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
78
+
79
+ def initialize(sheet)
80
+ @sheet = sheet
81
+ @in_v = false
82
+ end
83
+
84
+ def start_element(uri, local_name, qname, attributes)
85
+ return unless uri == URI
86
+ case local_name
87
+ when "row"
88
+ @sheet << []
89
+ when "v"
90
+ @in_v = true
91
+ end
92
+ end
93
+
94
+ def end_element(uri, local_name, qname)
95
+ @in_v = false
96
+ end
97
+
98
+ def characters(text)
99
+ add_column(text)
100
+ end
101
+
102
+ def cdata(content)
103
+ add_column(content)
104
+ end
105
+
106
+ private
107
+ def add_column(text)
108
+ return unless @in_v
109
+ @sheet.last << Integer(text, 10)
110
+ end
111
+ end
112
+ end
113
+ end
114
+ end
data/lib/chupa-text/decomposers/office-open-xml.rb ADDED
@@ -0,0 +1,196 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "cgi/util"
18
+ require "rexml/parsers/sax2parser"
19
+ require "rexml/sax2listener"
20
+
21
+ require "archive/zip"
22
+
23
+ module ChupaText
24
+ module Decomposers
25
+ class OfficeOpenXML < Decomposer
26
+ def target?(data)
27
+ @extensions.include?(data.extension) or
28
+ @mime_types.include?(data.mime_type)
29
+ end
30
+
31
+ def target_score(data)
32
+ if target?(data)
33
+ -1
34
+ else
35
+ nil
36
+ end
37
+ end
38
+
39
+ def decompose(data)
40
+ context = {
41
+ text: "",
42
+ attributes: {},
43
+ }
44
+ data.open do |input|
45
+ Archive::Zip.open(input) do |zip|
46
+ zip.each do |entry|
47
+ next unless entry.file?
48
+ case entry.zip_path
49
+ when "docProps/app.xml"
50
+ listener = AttributesListener.new(context[:attributes])
51
+ parse(entry.file_data, listener)
52
+ when "docProps/core.xml"
53
+ listener = AttributesListener.new(context[:attributes])
54
+ parse(entry.file_data, listener)
55
+ else
56
+ process_entry(entry, context)
57
+ end
58
+ end
59
+ end
60
+ end
61
+ text = accumulate_text(context)
62
+ text_data = TextData.new(text, source_data: data)
63
+ context[:attributes].each do |name, value|
64
+ text_data[name] = value
65
+ end
66
+ yield(text_data)
67
+ end
68
+
69
+ private
70
+ def parse(io, listener)
71
+ source = REXML::Source.new(io.read)
72
+ parser = REXML::Parsers::SAX2Parser.new(source)
73
+ parser.listen(listener)
74
+ parser.parse
75
+ end
76
+
77
+ def extract_text(entry, texts)
78
+ listener = TextListener.new(texts, @namespace_uri)
79
+ parse(entry.file_data, listener)
80
+ end
81
+
82
+ def accumulate_text(context)
83
+ context[:text]
84
+ end
85
+
86
+ class TextListener
87
+ include REXML::SAX2Listener
88
+
89
+ def initialize(output, target_uri)
90
+ @output = output
91
+ @target_uri = target_uri
92
+ @in_target = false
93
+ end
94
+
95
+ def start_element(uri, local_name, qname, attributes)
96
+ return unless uri == @target_uri
97
+ case local_name
98
+ when "t"
99
+ @in_target = true
100
+ end
101
+ end
102
+
103
+ def end_element(uri, local_name, qname)
104
+ @in_target = false
105
+
106
+ return unless uri == @target_uri
107
+ case local_name
108
+ when "p", "br"
109
+ @output << "\n"
110
+ end
111
+ end
112
+
113
+ def characters(text)
114
+ add_text(text)
115
+ end
116
+
117
+ def cdata(content)
118
+ add_text(content)
119
+ end
120
+
121
+ private
122
+ def add_text(text)
123
+ return unless @in_target
124
+ @output << CGI.unescapeHTML(text)
125
+ end
126
+ end
127
+
128
+ class AttributesListener
129
+ include REXML::SAX2Listener
130
+
131
+ CORE_PROPERTIES_URI =
132
+ "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
133
+ EXTENDED_PROPERTIES_URI =
134
+ "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
135
+ DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
136
+ DUBLIN_CORE_TERMS_URI = "http://purl.org/dc/terms/"
137
+
138
+ def initialize(attributes)
139
+ @attributes = attributes
140
+ @name = nil
141
+ @type = nil
142
+ end
143
+
144
+ def start_element(uri, local_name, qname, attributes)
145
+ case uri
146
+ when CORE_PROPERTIES_URI
147
+ case local_name
148
+ when "keywords"
149
+ @name = local_name
150
+ end
151
+ when EXTENDED_PROPERTIES_URI
152
+ case local_name
153
+ when "Application"
154
+ @name = local_name.downcase
155
+ end
156
+ when DUBLIN_CORE_URI
157
+ case local_name
158
+ when "description", "title", "subject"
159
+ @name = local_name
160
+ end
161
+ when DUBLIN_CORE_TERMS_URI
162
+ case local_name
163
+ when "created", "modified"
164
+ @name = "#{local_name}_time"
165
+ @type = :w3cdtf
166
+ end
167
+ end
168
+ end
169
+
170
+ def end_element(uri, local_name, qname)
171
+ @name = nil
172
+ @type = nil
173
+ end
174
+
175
+ def characters(text)
176
+ set_attribute(text)
177
+ end
178
+
179
+ def cdata(content)
180
+ set_attribute(content)
181
+ end
182
+
183
+ def set_attribute(value)
184
+ return if @name.nil?
185
+
186
+ value = CGI.unescapeHTML(value)
187
+ case @type
188
+ when :w3cdtf
189
+ value = Time.xmlschema(value)
190
+ end
191
+ @attributes[@name] = value
192
+ end
193
+ end
194
+ end
195
+ end
196
+ end