chupa-text 1.1.3 → 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/doc/text/news.md +12 -0
  3. data/lib/chupa-text/command/chupa-text.rb +7 -1
  4. data/lib/chupa-text/decomposer.rb +8 -0
  5. data/lib/chupa-text/decomposers/office-open-xml-document.rb +51 -0
  6. data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +67 -0
  7. data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +114 -0
  8. data/lib/chupa-text/decomposers/office-open-xml.rb +196 -0
  9. data/lib/chupa-text/decomposers/opendocument-presentation.rb +105 -0
  10. data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +134 -0
  11. data/lib/chupa-text/decomposers/opendocument-text.rb +89 -0
  12. data/lib/chupa-text/decomposers/opendocument.rb +139 -0
  13. data/lib/chupa-text/extractor.rb +8 -2
  14. data/lib/chupa-text/formatters/mime.rb +3 -2
  15. data/lib/chupa-text/version.rb +1 -1
  16. data/test/decomposers/test-office-open-xml-document.rb +144 -0
  17. data/test/decomposers/test-office-open-xml-presentation.rb +133 -0
  18. data/test/decomposers/test-office-open-xml-workbook.rb +138 -0
  19. data/test/decomposers/test-open-document-presentation.rb +136 -0
  20. data/test/decomposers/test-open-document-spreadsheet.rb +152 -0
  21. data/test/decomposers/test-open-document-text.rb +144 -0
  22. data/test/fixture/docx/attributes.docx +0 -0
  23. data/test/fixture/docx/multi-pages.docx +0 -0
  24. data/test/fixture/docx/one-page.docx +0 -0
  25. data/test/fixture/docx/special-characters.docx +0 -0
  26. data/test/fixture/odp/attributes.odp +0 -0
  27. data/test/fixture/odp/multi-slides.odp +0 -0
  28. data/test/fixture/odp/one-slide.odp +0 -0
  29. data/test/fixture/ods/attributes.ods +0 -0
  30. data/test/fixture/ods/multi-sheets.ods +0 -0
  31. data/test/fixture/ods/one-sheet.ods +0 -0
  32. data/test/fixture/odt/attributes.odt +0 -0
  33. data/test/fixture/odt/multi-pages.odt +0 -0
  34. data/test/fixture/odt/one-page.odt +0 -0
  35. data/test/fixture/odt/special-characters.odt +0 -0
  36. data/test/fixture/pptx/attributes.pptx +0 -0
  37. data/test/fixture/pptx/multi-slides.pptx +0 -0
  38. data/test/fixture/pptx/one-slide.pptx +0 -0
  39. data/test/fixture/xlsx/attributes.xlsx +0 -0
  40. data/test/fixture/xlsx/multi-sheets.xlsx +0 -0
  41. data/test/fixture/xlsx/one-sheet.xlsx +0 -0
  42. metadata +36 -2
@@ -0,0 +1,105 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/opendocument"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OpenDocumentPresentation < OpenDocument
22
+ registry.register("opendocument-presentation", self)
23
+
24
# Sets up the decomposer for OpenDocument Presentation files.
#
# @param options [Hash] passed through to the parent initializer.
def initialize(options={})
  super
  # Both values are compared against incoming data in OpenDocument#target?.
  @mime_type = "application/vnd.oasis.opendocument.presentation"
  @extension = "odp"
end
29
+
30
+ private
31
# Parses the content entry and collects one record per slide into
# context[:slides].
#
# @param entry the archive entry holding the presentation content.
# @param context [Hash] per-decomposition state shared with finish_decompose.
def process_content(entry, context, &block)
  slides = []
  context[:slides] = slides
  parse(entry.file_data, SlidesListener.new(slides))
end
36
+
37
# Yields the collected results: first an empty-bodied TextData carrying the
# document attributes, then one TextData per slide tagged with its index.
#
# @param context [Hash] state filled while the archive entries were read.
# @yieldparam text_data [TextData]
def finish_decompose(context, &block)
  source = context[:data]

  metadata = TextData.new("", source_data: source)
  context[:attributes].each_pair { |key, value| metadata[key] = value }
  yield(metadata)

  # context[:slides] is missing when the archive had no content entry.
  slides = context[:slides] || []
  slides.each_with_index do |slide, index|
    slide_data = TextData.new(slide[:text], source_data: source)
    slide_data["index"] = index
    yield(slide_data)
  end
end
51
+
52
# SAX listener that gathers the text of every slide (draw:page element).
#
# Each draw:page start appends a `{text: ""}` record to the array passed to
# the constructor. Character data found inside text:p elements is appended
# to the most recent record, and a newline is added when a paragraph closes.
class SlidesListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
  DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"

  # @param slides [Array<Hash>] array that is filled in place.
  def initialize(slides)
    @slides = slides
    # Number of currently open text:p elements. A counter (instead of a
    # flag that is cleared on every closing tag) keeps text that follows an
    # inline child such as text:span inside the same paragraph.
    @p_depth = 0
  end

  def start_element(uri, local_name, qname, attributes)
    case uri
    when TEXT_URI
      @p_depth += 1 if local_name == "p"
    when DRAW_URI
      # "".dup keeps the buffer mutable even under frozen string literals.
      @slides << {text: "".dup} if local_name == "page"
    end
  end

  def end_element(uri, local_name, qname)
    return unless uri == TEXT_URI && local_name == "p"

    @p_depth -= 1 if @p_depth > 0
    slide = @slides.last
    # A paragraph seen before any draw:page has no slide to belong to.
    slide[:text] << "\n" if slide
  end

  def characters(text)
    add_text(text)
  end

  def cdata(content)
    add_text(content)
  end

  private
  # Appends text to the current slide when inside a paragraph.
  # NOTE(review): the text is still passed through CGI.unescapeHTML as
  # before; whether the parser has already decoded entities at this point
  # depends on the REXML version in use - confirm.
  def add_text(text)
    return unless @p_depth > 0

    slide = @slides.last
    return if slide.nil?

    slide[:text] << CGI.unescapeHTML(text)
  end
end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,134 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/opendocument"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OpenDocumentSpreadsheet < OpenDocument
22
+ registry.register("opendocument-spreadsheet", self)
23
+
24
# Sets up the decomposer for OpenDocument Spreadsheet files.
#
# @param options [Hash] passed through to the parent initializer.
def initialize(options={})
  super
  # Both values are compared against incoming data in OpenDocument#target?.
  @mime_type = "application/vnd.oasis.opendocument.spreadsheet"
  @extension = "ods"
end
29
+
30
+ private
31
# Parses the content entry and collects one record per sheet into
# context[:sheets].
#
# @param entry the archive entry holding the spreadsheet content.
# @param context [Hash] per-decomposition state shared with finish_decompose.
def process_content(entry, context, &block)
  sheets = []
  context[:sheets] = sheets
  parse(entry.file_data, SheetsListener.new(sheets))
end
36
+
37
# Yields the collected results: first an empty-bodied TextData carrying the
# document attributes, then one TextData per sheet tagged with its index
# and, when present, its name.
#
# @param context [Hash] state filled while the archive entries were read.
# @yieldparam text_data [TextData]
def finish_decompose(context, &block)
  source = context[:data]

  metadata = TextData.new("", source_data: source)
  context[:attributes].each_pair { |key, value| metadata[key] = value }
  yield(metadata)

  # context[:sheets] is missing when the archive had no content entry.
  sheets = context[:sheets] || []
  sheets.each_with_index do |sheet, index|
    sheet_data = TextData.new(sheet[:text], source_data: source)
    sheet_data["index"] = index
    sheet_name = sheet[:name]
    sheet_data["name"] = sheet_name if sheet_name
    yield(sheet_data)
  end
end
53
+
54
# SAX listener that gathers the cell text of every sheet (table:table).
#
# Each table start appends a `{name:, rows: []}` record to the array passed
# to the constructor; rows and cells are collected beneath it. When the
# table closes, the record gains a :text entry holding one tab-separated
# line per row, skipping rows whose cells are all empty.
class SheetsListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
  TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"

  # @param sheets [Array<Hash>] array that is filled in place.
  def initialize(sheets)
    @sheets = sheets
    @prefix_to_uri = {}
    @uri_to_prefix = {}
    # Number of currently open text:p elements. A counter (instead of a
    # flag that is cleared on every closing tag) keeps text that follows an
    # inline child such as text:span inside the same paragraph.
    @p_depth = 0
  end

  def start_prefix_mapping(prefix, uri)
    @prefix_to_uri[prefix] = uri
    @uri_to_prefix[uri] = prefix
  end

  def end_prefix_mapping(prefix)
    uri = @prefix_to_uri.delete(prefix)
    @uri_to_prefix.delete(uri)
  end

  def start_element(uri, local_name, qname, attributes)
    case uri
    when TEXT_URI
      @p_depth += 1 if local_name == "p"
    when TABLE_URI
      case local_name
      when "table"
        # Attributes are keyed by qualified name, so the prefix currently
        # bound to the table namespace is needed to look up the sheet name.
        table_prefix = @uri_to_prefix[TABLE_URI]
        @sheets << {
          name: attributes["#{table_prefix}:name"],
          rows: [],
        }
      when "table-row"
        rows = current_rows
        rows << [] if rows
      when "table-cell"
        row = current_row
        # "".dup keeps the buffer mutable even under frozen string literals.
        row << {text: "".dup} if row
      end
    end
  end

  def end_element(uri, local_name, qname)
    case uri
    when TEXT_URI
      @p_depth -= 1 if local_name == "p" && @p_depth > 0
    when TABLE_URI
      finish_sheet if local_name == "table"
    end
  end

  def characters(text)
    add_text(text)
  end

  def cdata(content)
    add_text(content)
  end

  private
  # Rows of the sheet being read, or nil when no table has started.
  def current_rows
    sheet = @sheets.last
    sheet && sheet[:rows]
  end

  # Row being read, or nil when there is none.
  def current_row
    rows = current_rows
    rows && rows.last
  end

  # Cell being read, or nil when there is none.
  def current_cell
    row = current_row
    row && row.last
  end

  # Renders the rows of the sheet that just closed into its :text entry.
  def finish_sheet
    sheet = @sheets.last
    return if sheet.nil?

    rendered_rows = sheet[:rows].map do |row|
      row.map { |cell| cell[:text] }
    end
    rendered_rows = rendered_rows.reject { |texts| texts.all?(&:empty?) }
    sheet[:text] = rendered_rows.map { |texts| texts.join("\t") << "\n" }.join
  end

  # Appends text to the current cell when inside a paragraph. Paragraphs
  # that are not inside any cell are skipped instead of raising on nil.
  # NOTE(review): the text is still passed through CGI.unescapeHTML as
  # before; whether the parser has already decoded entities at this point
  # depends on the REXML version in use - confirm.
  def add_text(text)
    return unless @p_depth > 0

    cell = current_cell
    return if cell.nil?

    cell[:text] << CGI.unescapeHTML(text)
  end
end
132
+ end
133
+ end
134
+ end
@@ -0,0 +1,89 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "chupa-text/decomposers/opendocument"
18
+
19
+ module ChupaText
20
+ module Decomposers
21
+ class OpenDocumentText < OpenDocument
22
+ registry.register("opendocument-text", self)
23
+
24
# Sets up the decomposer for OpenDocument Text files.
#
# @param options [Hash] passed through to the parent initializer.
def initialize(options={})
  super
  # Both values are compared against incoming data in OpenDocument#target?.
  @mime_type = "application/vnd.oasis.opendocument.text"
  @extension = "odt"
end
29
+
30
+ private
31
# Parses the content entry and accumulates the document text into
# context[:text].
#
# @param entry the archive entry holding the document content.
# @param context [Hash] per-decomposition state shared with finish_decompose.
def process_content(entry, context, &block)
  buffer = ""
  context[:text] = buffer
  parse(entry.file_data, TextListener.new(buffer))
end
36
+
37
# Yields a single TextData holding the extracted text together with the
# document attributes.
#
# @param context [Hash] state filled while the archive entries were read.
# @yieldparam text_data [TextData]
def finish_decompose(context, &block)
  # context[:text] is missing when the archive had no content entry.
  body = context[:text] || ""
  document = TextData.new(body, source_data: context[:data])
  context[:attributes].each_pair { |key, value| document[key] = value }
  yield(document)
end
45
+
46
# SAX listener that appends the text of every text:p element to a string,
# adding a newline each time a paragraph closes.
class TextListener
  include REXML::SAX2Listener

  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"

  # @param output [String] buffer that is appended to in place.
  def initialize(output)
    @output = output
    # Number of currently open text:p elements. A counter (instead of a
    # flag that is cleared on every closing tag) keeps text that follows an
    # inline child such as text:span inside the same paragraph.
    @p_depth = 0
  end

  def start_element(uri, local_name, qname, attributes)
    return unless uri == TEXT_URI

    @p_depth += 1 if local_name == "p"
  end

  def end_element(uri, local_name, qname)
    return unless uri == TEXT_URI && local_name == "p"

    @p_depth -= 1 if @p_depth > 0
    @output << "\n"
  end

  def characters(text)
    add_text(text)
  end

  def cdata(content)
    add_text(content)
  end

  private
  # Appends text to the output when inside a paragraph.
  # NOTE(review): the text is still passed through CGI.unescapeHTML as
  # before; whether the parser has already decoded entities at this point
  # depends on the REXML version in use - confirm.
  def add_text(text)
    return unless @p_depth > 0

    @output << CGI.unescapeHTML(text)
  end
end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,139 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "cgi/util"
18
+ require "rexml/parsers/sax2parser"
19
+ require "rexml/sax2listener"
20
+
21
+ require "archive/zip"
22
+
23
+ module ChupaText
24
+ module Decomposers
25
+ class OpenDocument < Decomposer
26
# Tells whether the data looks like the document type handled by this
# decomposer, judged by its extension or its MIME type.
#
# @param data object responding to #extension and #mime_type.
# @return [Boolean]
def target?(data)
  # `||` rather than `or`: same result here, without the low-precedence trap.
  data.extension == @extension || data.mime_type == @mime_type
end
30
+
31
# Scores how well this decomposer fits the data.
#
# @param data object accepted by #target?.
# @return [Integer, nil] -1 when the data is a target, otherwise nil.
def target_score(data)
  target?(data) ? -1 : nil
end
38
+
39
# Opens the data as a ZIP archive, hands the "content.xml" and "meta.xml"
# file entries to process_content / process_meta, and finally calls
# finish_decompose with the accumulated context.
#
# @param data the source data; must respond to #open.
# @yield forwarded to the processing hooks and to finish_decompose.
def decompose(data, &block)
  context = {data: data, attributes: {}}

  data.open do |input|
    Archive::Zip.open(input) do |archive|
      archive.each do |entry|
        if entry.file?
          case entry.zip_path
          when "content.xml" then process_content(entry, context, &block)
          when "meta.xml" then process_meta(entry, context, &block)
          end
        end
      end
    end
  end

  finish_decompose(context, &block)
end
59
+
60
+ private
61
# Reads the whole stream and runs it through a SAX2 parser that reports
# its events to the given listener.
#
# @param io object responding to #read and returning the XML text.
# @param listener SAX2 listener receiving the parse events.
def parse(io, listener)
  xml = io.read
  sax = REXML::Parsers::SAX2Parser.new(REXML::Source.new(xml))
  sax.listen(listener)
  sax.parse
end
67
+
68
# Parses the metadata entry and stores the recognised values in
# context[:attributes].
#
# @param entry the archive entry holding the document metadata.
# @param context [Hash] per-decomposition state shared with finish_decompose.
def process_meta(entry, context, &block)
  collected = context[:attributes]
  parse(entry.file_data, AttributesListener.new(collected))
end
72
+
73
# SAX listener that extracts document attributes from the metadata entry.
#
# Recognised elements and the attribute they fill:
#   meta:creation-date -> "created_time"  (parsed with Time.xmlschema)
#   meta:keyword       -> "keywords"      (array, one item per element)
#   meta:generator     -> "generator"
#   dc:date            -> "modified_time" (parsed with Time.xmlschema)
#   dc:description, dc:title, dc:subject -> same name as the element
class AttributesListener
  include REXML::SAX2Listener

  META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
  DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"

  # @param attributes [Hash] hash that is filled in place.
  def initialize(attributes)
    @attributes = attributes
    @name = nil
    @type = nil
    @buffer = nil
  end

  def start_element(uri, local_name, qname, attributes)
    case uri
    when META_URI
      case local_name
      when "creation-date"
        watch("created_time", :w3cdtf)
      when "keyword"
        watch("keywords", :array)
      when "generator"
        watch(local_name)
      end
    when DUBLIN_CORE_URI
      case local_name
      when "date"
        watch("modified_time", :w3cdtf)
      when "description", "title", "subject"
        watch(local_name)
      end
    end
  end

  def end_element(uri, local_name, qname)
    # Commit once per element so that a value delivered in several
    # characters/cdata callbacks is stored whole instead of each chunk
    # overwriting (or, for keywords, duplicating) the previous one.
    set_attribute(@buffer) if @buffer
    @name = nil
    @type = nil
    @buffer = nil
  end

  def characters(text)
    append(text)
  end

  def cdata(content)
    append(content)
  end

  # Stores a value for the element currently being read; does nothing when
  # the current element is not a recognised one.
  #
  # @param value [String] raw element text.
  # @raise [ArgumentError] from Time.xmlschema when a date value is malformed.
  def set_attribute(value)
    return if @name.nil?

    value = CGI.unescapeHTML(value)
    case @type
    when :w3cdtf
      value = Time.xmlschema(value)
    when :array
      values = @attributes[@name] || []
      values << value
      value = values
    end
    @attributes[@name] = value
  end

  private
  # Starts collecting text for the attribute with the given name.
  def watch(name, type=nil)
    @name = name
    @type = type
    @buffer = nil
  end

  # Accumulates text for the element currently being read, if recognised.
  def append(text)
    return if @name.nil?

    @buffer ||= "".dup
    @buffer << text
  end
end
137
+ end
138
+ end
139
+ end