chupa-text 1.1.3 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/doc/text/news.md +12 -0
  3. data/lib/chupa-text/command/chupa-text.rb +7 -1
  4. data/lib/chupa-text/decomposer.rb +8 -0
  5. data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
  6. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
  8. data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
  9. data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
  11. data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
  12. data/lib/chupa-text/decomposers/opendocument.rb +139 -0
  13. data/lib/chupa-text/extractor.rb +8 -2
  14. data/lib/chupa-text/formatters/mime.rb +3 -2
  15. data/lib/chupa-text/version.rb +1 -1
  16. data/test/decomposers/test-office-open-xml-document.rb +144 -0
  17. data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
  18. data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
  19. data/test/decomposers/test-open-document-presentation.rb +136 -0
  20. data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
  21. data/test/decomposers/test-open-document-text.rb +144 -0
  22. data/test/fixture/docx/attributes.docx +0 -0
  23. data/test/fixture/docx/multi-pages.docx +0 -0
  24. data/test/fixture/docx/one-page.docx +0 -0
  25. data/test/fixture/docx/special-characters.docx +0 -0
  26. data/test/fixture/odp/attributes.odp +0 -0
  27. data/test/fixture/odp/multi-slides.odp +0 -0
  28. data/test/fixture/odp/one-slide.odp +0 -0
  29. data/test/fixture/ods/attributes.ods +0 -0
  30. data/test/fixture/ods/multi-sheets.ods +0 -0
  31. data/test/fixture/ods/one-sheet.ods +0 -0
  32. data/test/fixture/odt/attributes.odt +0 -0
  33. data/test/fixture/odt/multi-pages.odt +0 -0
  34. data/test/fixture/odt/one-page.odt +0 -0
  35. data/test/fixture/odt/special-characters.odt +0 -0
  36. data/test/fixture/pptx/attributes.pptx +0 -0
  37. data/test/fixture/pptx/multi-slides.pptx +0 -0
  38. data/test/fixture/pptx/one-slide.pptx +0 -0
  39. data/test/fixture/xlsx/attributes.xlsx +0 -0
  40. data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
  41. data/test/fixture/xlsx/one-sheet.xlsx +0 -0
  42. metadata +36 -2
@@ -0,0 +1,105 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/opendocument"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OpenDocumentPresentation < OpenDocument
22
+ registry.register("opendocument-presentation", self)
23
+
24
# Configures this decomposer for OpenDocument Presentation files.
#
# Stores the file extension and MIME type in @extension and @mime_type
# after delegating to the superclass constructor.
#
# options: passed through unchanged to the superclass.
def initialize(options={})
  super(options)
  @mime_type = "application/vnd.oasis.opendocument.presentation"
  @extension = "odp"
end
29
+
30
+ private
31
# Parses one archive entry with a SlidesListener.
#
# A fresh array is stored in context[:slides]; the listener appends one
# {text: String} hash to it per slide it encounters.
#
# entry:   archive entry whose #file_data is handed to #parse.
# context: Hash shared across the decompose run; :slides is (re)set here.
def process_content(entry, context, &block)
  slides = []
  context[:slides] = slides
  parse(entry.file_data, SlidesListener.new(slides))
end
36
+
37
# Emits the extraction results for a presentation.
#
# Yields first a TextData with an empty body that carries every entry of
# context[:attributes], then one TextData per slide in context[:slides]
# (none when that key is unset). Each slide TextData gets its zero-based
# position under the "index" key.
#
# context: Hash with :data, :attributes and optionally :slides.
def finish_decompose(context, &block)
  source = context[:data]

  metadata = TextData.new("", source_data: source)
  context[:attributes].each { |key, val| metadata[key] = val }
  yield(metadata)

  slides = context[:slides] || []
  slides.each_with_index do |slide, position|
    slide_data = TextData.new(slide[:text], source_data: source)
    slide_data["index"] = position
    yield(slide_data)
  end
end
51
+
52
# SAX2 listener that gathers the paragraph text of each slide.
#
# Every draw:page element opens a new {text: String} hash in the array
# given to the constructor. Character data seen inside a text:p element
# is appended to the current slide, and each closing text:p appends a
# newline.
class SlidesListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
  DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"

  # slides: Array that receives one {text: String} hash per draw:page.
  def initialize(slides)
    @slides = slides
    @in_p = false
  end

  def start_element(uri, local_name, qname, attributes)
    case uri
    when TEXT_URI
      @in_p = true if local_name == "p"
    when DRAW_URI
      @slides << {text: ""} if local_name == "page"
    end
  end

  def end_element(uri, local_name, qname)
    # Only the closing text:p ends the paragraph. Resetting the flag on
    # every closing tag would drop the text that follows an inline child
    # such as text:span.
    return unless uri == TEXT_URI && local_name == "p"

    @in_p = false
    slide = @slides.last
    # A paragraph can close before any draw:page was opened; there is no
    # slide to terminate in that case.
    slide[:text] << "\n" if slide
  end

  def characters(text)
    add_text(text)
  end

  def cdata(content)
    add_text(content)
  end

  private
  # Appends unescaped text to the current slide while inside a paragraph.
  # Text seen before the first slide is ignored.
  def add_text(text)
    return unless @in_p

    slide = @slides.last
    return if slide.nil?

    slide[:text] << CGI.unescapeHTML(text)
  end
end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,134 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/opendocument"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OpenDocumentSpreadsheet < OpenDocument
22
+ registry.register("opendocument-spreadsheet", self)
23
+
24
# Configures this decomposer for OpenDocument Spreadsheet files.
#
# Stores the file extension and MIME type in @extension and @mime_type
# after delegating to the superclass constructor.
#
# options: passed through unchanged to the superclass.
def initialize(options={})
  super(options)
  @mime_type = "application/vnd.oasis.opendocument.spreadsheet"
  @extension = "ods"
end
29
+
30
+ private
31
# Parses one archive entry with a SheetsListener.
#
# A fresh array is stored in context[:sheets]; the listener appends one
# hash per table it encounters.
#
# entry:   archive entry whose #file_data is handed to #parse.
# context: Hash shared across the decompose run; :sheets is (re)set here.
def process_content(entry, context, &block)
  sheets = []
  context[:sheets] = sheets
  parse(entry.file_data, SheetsListener.new(sheets))
end
36
+
37
# Emits the extraction results for a spreadsheet.
#
# Yields first a TextData with an empty body that carries every entry of
# context[:attributes], then one TextData per sheet in context[:sheets]
# (none when that key is unset). Each sheet TextData gets its zero-based
# position under "index" and, when the sheet has one, its name under
# "name".
#
# context: Hash with :data, :attributes and optionally :sheets.
def finish_decompose(context, &block)
  source = context[:data]

  metadata = TextData.new("", source_data: source)
  context[:attributes].each { |key, val| metadata[key] = val }
  yield(metadata)

  sheets = context[:sheets] || []
  sheets.each_with_index do |sheet, position|
    sheet_data = TextData.new(sheet[:text], source_data: source)
    sheet_data["index"] = position
    sheet_name = sheet[:name]
    sheet_data["name"] = sheet_name if sheet_name
    yield(sheet_data)
  end
end
53
+
54
# SAX2 listener that gathers the cell text of each sheet.
#
# Every table:table element opens a new {name:, rows: []} hash in the
# array given to the constructor; table:table-row and table:table-cell
# add rows and cells to it. Character data seen inside a text:p element
# is appended to the current cell. When a table closes, its rows are
# flattened into sheet[:text]: cells joined by tabs, one row per line,
# rows whose cells are all empty skipped.
class SheetsListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
  TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"

  # sheets: Array that receives one hash per table:table.
  def initialize(sheets)
    @sheets = sheets
    @prefix_to_uri = {}
    @uri_to_prefix = {}
    @in_p = false
  end

  # Namespace prefixes are tracked so the table name attribute can be
  # looked up by its qualified name in #start_element.
  def start_prefix_mapping(prefix, uri)
    @prefix_to_uri[prefix] = uri
    @uri_to_prefix[uri] = prefix
  end

  def end_prefix_mapping(prefix)
    uri = @prefix_to_uri.delete(prefix)
    @uri_to_prefix.delete(uri)
  end

  def start_element(uri, local_name, qname, attributes)
    case uri
    when TEXT_URI
      @in_p = true if local_name == "p"
    when TABLE_URI
      table_prefix = @uri_to_prefix[TABLE_URI]
      case local_name
      when "table"
        @sheets << {
          name: attributes["#{table_prefix}:name"],
          rows: [],
        }
      when "table-row"
        @sheets.last[:rows] << []
      when "table-cell"
        @sheets.last[:rows].last << {text: ""}
      end
    end
  end

  def end_element(uri, local_name, qname)
    case uri
    when TEXT_URI
      # Only the closing text:p ends the paragraph. Resetting the flag on
      # every closing tag would drop the text that follows an inline
      # child such as text:span.
      @in_p = false if local_name == "p"
    when TABLE_URI
      build_sheet_text(@sheets.last) if local_name == "table"
    end
  end

  def characters(text)
    add_text(text)
  end

  def cdata(content)
    add_text(content)
  end

  private
  # Appends unescaped text to the current cell while inside a paragraph.
  # A paragraph that appears before any cell exists is ignored instead
  # of raising on nil.
  def add_text(text)
    return unless @in_p

    cell = current_cell
    return if cell.nil?

    cell[:text] << CGI.unescapeHTML(text)
  end

  # Returns the most recently opened cell, or nil when no sheet, row or
  # cell has been opened yet.
  def current_cell
    sheet = @sheets.last
    row = sheet && sheet[:rows].last
    row && row.last
  end

  # Flattens the collected rows of +sheet+ into sheet[:text].
  def build_sheet_text(sheet)
    text = ""
    sheet[:rows].each do |row|
      cell_texts = row.collect {|cell| cell[:text]}
      next if cell_texts.all?(&:empty?)
      text << cell_texts.join("\t") << "\n"
    end
    sheet[:text] = text
  end
end
132
+ end
133
+ end
134
+ end
@@ -0,0 +1,89 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/opendocument"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OpenDocumentText < OpenDocument
22
+ registry.register("opendocument-text", self)
23
+
24
# Configures this decomposer for OpenDocument Text files.
#
# Stores the file extension and MIME type in @extension and @mime_type
# after delegating to the superclass constructor.
#
# options: passed through unchanged to the superclass.
def initialize(options={})
  super(options)
  @mime_type = "application/vnd.oasis.opendocument.text"
  @extension = "odt"
end
29
+
30
+ private
31
# Parses one archive entry with a TextListener.
#
# A fresh string is stored in context[:text]; the listener appends the
# extracted text to that same string.
#
# entry:   archive entry whose #file_data is handed to #parse.
# context: Hash shared across the decompose run; :text is (re)set here.
def process_content(entry, context, &block)
  buffer = ""
  context[:text] = buffer
  parse(entry.file_data, TextListener.new(buffer))
end
36
+
37
# Emits the extraction result for a text document.
#
# Yields a single TextData whose body is context[:text] (an empty string
# when that key is unset) and which carries every entry of
# context[:attributes].
#
# context: Hash with :data, :attributes and optionally :text.
def finish_decompose(context, &block)
  body = context[:text] || ""
  document = TextData.new(body, source_data: context[:data])
  context[:attributes].each { |key, val| document[key] = val }
  yield(document)
end
45
+
46
# SAX2 listener that gathers the body text of a text document.
#
# Character data seen inside a text:p (paragraph) or text:h (heading)
# element is appended to the output string given to the constructor, and
# each closing paragraph or heading appends a newline.
class TextListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"

  # Local names of the block-level text elements whose content is kept.
  # Headings are included so that their text is not lost from the output.
  BLOCK_ELEMENTS = ["p", "h"].freeze

  # output: String that the extracted text is appended to.
  def initialize(output)
    @output = output
    @in_p = false
  end

  def start_element(uri, local_name, qname, attributes)
    return unless uri == TEXT_URI

    @in_p = true if BLOCK_ELEMENTS.include?(local_name)
  end

  def end_element(uri, local_name, qname)
    # Only the closing paragraph/heading ends the block. Resetting the
    # flag on every closing tag would drop the text that follows an
    # inline child such as text:span.
    return unless uri == TEXT_URI
    return unless BLOCK_ELEMENTS.include?(local_name)

    @in_p = false
    @output << "\n"
  end

  def characters(text)
    add_text(text)
  end

  def cdata(content)
    add_text(content)
  end

  private
  # Appends unescaped text to the output while inside a paragraph or
  # heading; text elsewhere is ignored.
  def add_text(text)
    return unless @in_p

    @output << CGI.unescapeHTML(text)
  end
end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,139 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "cgi/util"
18
+ require "rexml/parsers/sax2parser"
19
+ require "rexml/sax2listener"
20
+
21
+ require "archive/zip"
22
+
23
+ module ChupaText
24
+ module Decomposers
25
+ class OpenDocument < Decomposer
26
# Tells whether +data+ is handled by this decomposer.
#
# Returns true when data.extension equals @extension or, failing that,
# when data.mime_type equals @mime_type.
def target?(data)
  extension_matched = (data.extension == @extension)
  extension_matched || data.mime_type == @mime_type
end
30
+
31
# Scores +data+ for this decomposer.
#
# Returns -1 when #target? accepts the data and nil otherwise.
def target_score(data)
  return -1 if target?(data)

  nil
end
38
+
39
# Walks the zip archive behind +data+ and dispatches its known entries.
#
# The "content.xml" entry goes to #process_content and "meta.xml" to
# #process_meta; entries that are not files, and all other paths, are
# skipped. Both handlers share one context hash, which starts with :data
# and an empty :attributes hash. After the archive has been read,
# #finish_decompose is called with that context and the caller's block.
def decompose(data, &block)
  context = {data: data, attributes: {}}

  data.open do |input|
    Archive::Zip.open(input) do |zip|
      zip.each do |entry|
        next unless entry.file?

        path = entry.zip_path
        if "content.xml" === path
          process_content(entry, context, &block)
        elsif "meta.xml" === path
          process_meta(entry, context, &block)
        end
      end
    end
  end

  finish_decompose(context, &block)
end
59
+
60
+ private
61
# Runs a REXML SAX2 parse over everything read from +io+, delivering the
# events to +listener+.
#
# io:       object responding to #read; its whole content is read first.
# listener: SAX2 listener registered on the parser before parsing.
def parse(io, listener)
  xml = io.read
  sax = REXML::Parsers::SAX2Parser.new(REXML::Source.new(xml))
  sax.listen(listener)
  sax.parse
end
67
+
68
# Parses one archive entry with an AttributesListener, which writes the
# metadata it recognizes into context[:attributes].
#
# entry:   archive entry whose #file_data is handed to #parse.
# context: Hash shared across the decompose run.
def process_meta(entry, context, &block)
  collector = AttributesListener.new(context[:attributes])
  parse(entry.file_data, collector)
end
72
+
73
# SAX2 listener that copies selected metadata elements into the hash
# given to the constructor.
#
# Recognized elements and the keys they are stored under:
#   meta:creation-date -> "created_time"  (parsed with Time.xmlschema)
#   meta:keyword       -> "keywords"      (values accumulated in an Array)
#   meta:generator     -> "generator"
#   dc:date            -> "modified_time" (parsed with Time.xmlschema)
#   dc:description, dc:title, dc:subject -> stored under their own name
#
# NOTE(review): Time.xmlschema comes from the stdlib "time" library,
# which this file does not require itself -- confirm it is loaded
# elsewhere.
class AttributesListener
  include REXML::SAX2Listener

  META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
  DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"

  # attributes: Hash that receives the extracted metadata.
  def initialize(attributes)
    @attributes = attributes
    @name = nil
    @type = nil
  end

  # Remembers under which key, and with which conversion, the text of
  # the element being opened is to be stored. Unrecognized elements
  # leave the current state untouched.
  def start_element(uri, local_name, qname, attributes)
    if META_URI === uri
      if local_name == "creation-date"
        @name = "created_time"
        @type = :w3cdtf
      elsif local_name == "keyword"
        @name = "keywords"
        @type = :array
      elsif local_name == "generator"
        @name = local_name
      end
    elsif DUBLIN_CORE_URI === uri
      if local_name == "date"
        @name = "modified_time"
        @type = :w3cdtf
      elsif ["description", "title", "subject"].include?(local_name)
        @name = local_name
      end
    end
  end

  # Any closing tag ends the capture of the current element.
  def end_element(uri, local_name, qname)
    @type = nil
    @name = nil
  end

  def characters(text)
    set_attribute(text)
  end

  def cdata(content)
    set_attribute(content)
  end

  # Stores +value+ under the key chosen by #start_element, after HTML
  # unescaping and the conversion selected by @type. Does nothing while
  # no recognized element is open.
  def set_attribute(value)
    return if @name.nil?

    unescaped = CGI.unescapeHTML(value)
    converted =
      if @type == :w3cdtf
        Time.xmlschema(unescaped)
      elsif @type == :array
        (@attributes[@name] || []) << unescaped
      else
        unescaped
      end
    @attributes[@name] = converted
  end
end
137
+ end
138
+ end
139
+ end