chupa-text 1.1.4 → 1.1.5

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ce1def40525d7278aa45cdbe3af69cd95656fa58d1d02e6ddb3e2677940ed7d6
4
- data.tar.gz: 4ec184c4bd0f61508d4b1908e7c89abd8aeb01026c8b3590b404ca672887c6a1
3
+ metadata.gz: 65f2eb5be135f3db4a0cbb81bc4a268bcb752934da6d9aa32740842bb4cc674f
4
+ data.tar.gz: 72f8487dc57b67293e001621f6b63ece4680cf1069d9f98e3daa89708571766b
5
5
  SHA512:
6
- metadata.gz: ae43e4354761a953f61cda5348524f44346996133e4f77a310bfbfd07295b4548d41847f8132387ec9ade30c44a9c2bdd7936c7633085534ce03bb5db6f9061f
7
- data.tar.gz: f9174e01e21a2dbbc1647d191084969e9fd0cf4dc6f24a469ac7c6c4f2378d7a11946e6b2350e8df86ad5d007c6f048c804d4c1c2b79e95628511383c76c6061
6
+ metadata.gz: 32149617d9c921de856aa9fb3856a8a29fde5e9e1872a16241235499b1b8e014941928fb040ad60d52c15cbc4b69d84a9860bd432074964865e7f1e85d673a1e
7
+ data.tar.gz: 85c2820468547c9b34fef27faa2db0fc9d8208a266949177b2e4f2269bfadcd8f5b534c0cdb4aff9e28360c1509dd6b2d06ef72c7d1e268f399e97595a133a2c
data/chupa-text.gemspec CHANGED
@@ -53,8 +53,9 @@ Gem::Specification.new do |spec|
53
53
  spec.add_runtime_dependency("archive-zip")
54
54
 
55
55
  spec.add_development_dependency("bundler")
56
- spec.add_development_dependency("rake")
57
- spec.add_development_dependency("test-unit")
56
+ spec.add_development_dependency("nokogiri")
58
57
  spec.add_development_dependency("packnga")
58
+ spec.add_development_dependency("rake")
59
59
  spec.add_development_dependency("redcarpet")
60
+ spec.add_development_dependency("test-unit")
60
61
  end
data/doc/text/news.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # News
2
2
 
3
+ ## 1.1.5: 2019-02-28
4
+
5
+ ### Improvements
6
+
7
+ * Added support for Nokogiri as an alternative SAX parser.
8
+
3
9
  ## 1.1.4: 2019-02-26
4
10
 
5
11
  ### Improvements
@@ -71,9 +71,7 @@ module ChupaText
71
71
  sheet_texts.join("\n")
72
72
  end
73
73
 
74
- class SheetListener
75
- include REXML::SAX2Listener
76
-
74
+ class SheetListener < SAXListener
77
75
  URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
78
76
 
79
77
  def initialize(sheet)
@@ -14,12 +14,10 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "cgi/util"
18
- require "rexml/parsers/sax2parser"
19
- require "rexml/sax2listener"
20
-
21
17
  require "archive/zip"
22
18
 
19
+ require "chupa-text/sax-parser"
20
+
23
21
  module ChupaText
24
22
  module Decomposers
25
23
  class OfficeOpenXML < Decomposer
@@ -67,10 +65,8 @@ module ChupaText
67
65
  end
68
66
 
69
67
  private
70
- def parse(io, listener)
71
- source = REXML::Source.new(io.read)
72
- parser = REXML::Parsers::SAX2Parser.new(source)
73
- parser.listen(listener)
68
+ def parse(input, listener)
69
+ parser = SAXParser.new(input, listener)
74
70
  parser.parse
75
71
  end
76
72
 
@@ -83,9 +79,7 @@ module ChupaText
83
79
  context[:text]
84
80
  end
85
81
 
86
- class TextListener
87
- include REXML::SAX2Listener
88
-
82
+ class TextListener < SAXListener
89
83
  def initialize(output, target_uri)
90
84
  @output = output
91
85
  @target_uri = target_uri
@@ -121,13 +115,11 @@ module ChupaText
121
115
  private
122
116
  def add_text(text)
123
117
  return unless @in_target
124
- @output << CGI.unescapeHTML(text)
118
+ @output << text
125
119
  end
126
120
  end
127
121
 
128
- class AttributesListener
129
- include REXML::SAX2Listener
130
-
122
+ class AttributesListener < SAXListener
131
123
  CORE_PROPERTIES_URI =
132
124
  "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
133
125
  EXTENDED_PROPERTIES_URI =
@@ -49,9 +49,7 @@ module ChupaText
49
49
  end
50
50
  end
51
51
 
52
- class SlidesListener
53
- include REXML::SAX2Listener
54
-
52
+ class SlidesListener < SAXListener
55
53
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
56
54
  DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
57
55
 
@@ -97,7 +95,7 @@ module ChupaText
97
95
  private
98
96
  def add_text(text)
99
97
  return unless @in_p
100
- @slides.last[:text] << CGI.unescapeHTML(text)
98
+ @slides.last[:text] << text
101
99
  end
102
100
  end
103
101
  end
@@ -51,9 +51,7 @@ module ChupaText
51
51
  end
52
52
  end
53
53
 
54
- class SheetsListener
55
- include REXML::SAX2Listener
56
-
54
+ class SheetsListener < SAXListener
57
55
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
58
56
  TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"
59
57
 
@@ -126,7 +124,7 @@ module ChupaText
126
124
  private
127
125
  def add_text(text)
128
126
  return unless @in_p
129
- @sheets.last[:rows].last.last[:text] << CGI.unescapeHTML(text)
127
+ @sheets.last[:rows].last.last[:text] << text
130
128
  end
131
129
  end
132
130
  end
@@ -43,9 +43,7 @@ module ChupaText
43
43
  yield(text_data)
44
44
  end
45
45
 
46
- class TextListener
47
- include REXML::SAX2Listener
48
-
46
+ class TextListener < SAXListener
49
47
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
50
48
  def initialize(output)
51
49
  @output = output
@@ -81,7 +79,7 @@ module ChupaText
81
79
  private
82
80
  def add_text(text)
83
81
  return unless @in_p
84
- @output << CGI.unescapeHTML(text)
82
+ @output << text
85
83
  end
86
84
  end
87
85
  end
@@ -14,12 +14,10 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "cgi/util"
18
- require "rexml/parsers/sax2parser"
19
- require "rexml/sax2listener"
20
-
21
17
  require "archive/zip"
22
18
 
19
+ require "chupa-text/sax-parser"
20
+
23
21
  module ChupaText
24
22
  module Decomposers
25
23
  class OpenDocument < Decomposer
@@ -58,10 +56,8 @@ module ChupaText
58
56
  end
59
57
 
60
58
  private
61
- def parse(io, listener)
62
- source = REXML::Source.new(io.read)
63
- parser = REXML::Parsers::SAX2Parser.new(source)
64
- parser.listen(listener)
59
+ def parse(input, listener)
60
+ parser = SAXParser.new(input, listener)
65
61
  parser.parse
66
62
  end
67
63
 
@@ -70,9 +66,7 @@ module ChupaText
70
66
  parse(entry.file_data, listener)
71
67
  end
72
68
 
73
- class AttributesListener
74
- include REXML::SAX2Listener
75
-
69
+ class AttributesListener < SAXListener
76
70
  META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
77
71
  DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
78
72
 
@@ -122,7 +116,6 @@ module ChupaText
122
116
  def set_attribute(value)
123
117
  return if @name.nil?
124
118
 
125
- value = CGI.unescapeHTML(value)
126
119
  case @type
127
120
  when :w3cdtf
128
121
  value = Time.xmlschema(value)
@@ -0,0 +1,151 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "cgi/util"
18
+ require "rexml/parsers/sax2parser"
19
+ require "rexml/sax2listener"
20
+
21
+ begin
22
+ require "nokogiri"
23
+ rescue LoadError
24
+ end
25
+
26
+ module ChupaText
27
+ class SAXParser
28
+ class << self
29
+ def backend
30
+ case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
31
+ when "rexml"
32
+ :rexml
33
+ else
34
+ if Object.const_defined?(:Nokogiri)
35
+ :nokogiri
36
+ else
37
+ :rexml
38
+ end
39
+ end
40
+ end
41
+ end
42
+
43
+ def initialize(input, listener)
44
+ @input = input
45
+ @listener = listener
46
+ end
47
+
48
+ if backend == :nokogiri
49
+ def parse
50
+ document = Document.new(@listener)
51
+ parser = Nokogiri::XML::SAX::Parser.new(document)
52
+ parser.parse(@input)
53
+ end
54
+
55
+ class Document < Nokogiri::XML::SAX::Document
56
+ def initialize(listener)
57
+ @listener = listener
58
+ @namespaces_stack = []
59
+ end
60
+
61
+ def start_element_namespace(name,
62
+ attributes=[],
63
+ prefix=nil,
64
+ uri=nil,
65
+ namespaces=[])
66
+ namespaces.each do |namespace_prefix, namespace_uri|
67
+ @listener.start_prefix_mapping(namespace_prefix, namespace_uri)
68
+ end
69
+ attributes_hash = {}
70
+ attributes.each do |attribute|
71
+ attribute_qname = build_qname(attribute.prefix, attribute.localname)
72
+ attributes_hash[attribute_qname] = attribute.value
73
+ end
74
+ @namespaces_stack.push(namespaces)
75
+ @listener.start_element(uri,
76
+ name,
77
+ build_qname(prefix, name),
78
+ attributes_hash)
79
+ end
80
+
81
+ def end_element_namespace(name, prefix=nil, uri=nil)
82
+ @listener.end_element(uri, name, build_qname(prefix, name))
83
+ namespaces = @namespaces_stack.pop
84
+ namespaces.each do |namespace_prefix, _|
85
+ @listener.end_prefix_mapping(namespace_prefix)
86
+ end
87
+ end
88
+
89
+ def characters(text)
90
+ @listener.characters(text)
91
+ end
92
+
93
+ def cdata_block(content)
94
+ @listener.cdata(content)
95
+ end
96
+
97
+ private
98
+ def build_qname(prefix, local_name)
99
+ if prefix
100
+ "#{prefix}:#{local_name}"
101
+ else
102
+ local_name
103
+ end
104
+ end
105
+ end
106
+ else
107
+ def parse
108
+ source = REXML::Source.new(@input.read)
109
+ parser = REXML::Parsers::SAX2Parser.new(source)
110
+ parser.listen(Listener.new(@listener))
111
+ parser.parse
112
+ end
113
+
114
+ class Listener
115
+ include REXML::SAX2Listener
116
+
117
+ def initialize(listener)
118
+ @listener = listener
119
+ end
120
+
121
+ def start_prefix_mapping(*args)
122
+ @listener.start_prefix_mapping(*args)
123
+ end
124
+
125
+ def end_prefix_mapping(*args)
126
+ @listener.end_prefix_mapping(*args)
127
+ end
128
+
129
+ def start_element(*args)
130
+ @listener.start_element(*args)
131
+ end
132
+
133
+ def end_element(*args)
134
+ @listener.end_element(*args)
135
+ end
136
+
137
+ def characters(text)
138
+ @listener.characters(CGI.unescapeHTML(text))
139
+ end
140
+
141
+ def cdata(content)
142
+ @listener.cdata(CGI.unescapeHTML(content))
143
+ end
144
+ end
145
+ end
146
+ end
147
+
148
+ class SAXListener
149
+ include REXML::SAX2Listener
150
+ end
151
+ end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2018 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.1.4"
18
+ VERSION = "1.1.5"
19
19
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.4
4
+ version: 1.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-26 00:00:00.000000000 Z
11
+ date: 2019-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: archive-zip
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: rake
42
+ name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
@@ -53,7 +53,7 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: test-unit
56
+ name: packnga
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - ">="
@@ -67,7 +67,7 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: packnga
70
+ name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: test-unit
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  description: ''
98
112
  email:
99
113
  - kou@clear-code.com
@@ -156,6 +170,7 @@ files:
156
170
  - lib/chupa-text/logger.rb
157
171
  - lib/chupa-text/mime-type-registry.rb
158
172
  - lib/chupa-text/mime-type.rb
173
+ - lib/chupa-text/sax-parser.rb
159
174
  - lib/chupa-text/screenshot.rb
160
175
  - lib/chupa-text/size-parser.rb
161
176
  - lib/chupa-text/text-data.rb