chupa-text 1.1.4 → 1.1.5

Sign up for free protection for your applications and access to all features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ce1def40525d7278aa45cdbe3af69cd95656fa58d1d02e6ddb3e2677940ed7d6
4
- data.tar.gz: 4ec184c4bd0f61508d4b1908e7c89abd8aeb01026c8b3590b404ca672887c6a1
3
+ metadata.gz: 65f2eb5be135f3db4a0cbb81bc4a268bcb752934da6d9aa32740842bb4cc674f
4
+ data.tar.gz: 72f8487dc57b67293e001621f6b63ece4680cf1069d9f98e3daa89708571766b
5
5
  SHA512:
6
- metadata.gz: ae43e4354761a953f61cda5348524f44346996133e4f77a310bfbfd07295b4548d41847f8132387ec9ade30c44a9c2bdd7936c7633085534ce03bb5db6f9061f
7
- data.tar.gz: f9174e01e21a2dbbc1647d191084969e9fd0cf4dc6f24a469ac7c6c4f2378d7a11946e6b2350e8df86ad5d007c6f048c804d4c1c2b79e95628511383c76c6061
6
+ metadata.gz: 32149617d9c921de856aa9fb3856a8a29fde5e9e1872a16241235499b1b8e014941928fb040ad60d52c15cbc4b69d84a9860bd432074964865e7f1e85d673a1e
7
+ data.tar.gz: 85c2820468547c9b34fef27faa2db0fc9d8208a266949177b2e4f2269bfadcd8f5b534c0cdb4aff9e28360c1509dd6b2d06ef72c7d1e268f399e97595a133a2c
data/chupa-text.gemspec CHANGED
@@ -53,8 +53,9 @@ Gem::Specification.new do |spec|
53
53
  spec.add_runtime_dependency("archive-zip")
54
54
 
55
55
  spec.add_development_dependency("bundler")
56
- spec.add_development_dependency("rake")
57
- spec.add_development_dependency("test-unit")
56
+ spec.add_development_dependency("nokogiri")
58
57
  spec.add_development_dependency("packnga")
58
+ spec.add_development_dependency("rake")
59
59
  spec.add_development_dependency("redcarpet")
60
+ spec.add_development_dependency("test-unit")
60
61
  end
data/doc/text/news.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # News
2
2
 
3
+ ## 1.1.5: 2019-02-28
4
+
5
+ ### Improvements
6
+
7
+ * Added support for Nokogiri as an alternative SAX parser.
8
+
3
9
  ## 1.1.4: 2019-02-26
4
10
 
5
11
  ### Improvements
@@ -71,9 +71,7 @@ module ChupaText
71
71
  sheet_texts.join("\n")
72
72
  end
73
73
 
74
- class SheetListener
75
- include REXML::SAX2Listener
76
-
74
+ class SheetListener < SAXListener
77
75
  URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
78
76
 
79
77
  def initialize(sheet)
@@ -14,12 +14,10 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "cgi/util"
18
- require "rexml/parsers/sax2parser"
19
- require "rexml/sax2listener"
20
-
21
17
  require "archive/zip"
22
18
 
19
+ require "chupa-text/sax-parser"
20
+
23
21
  module ChupaText
24
22
  module Decomposers
25
23
  class OfficeOpenXML < Decomposer
@@ -67,10 +65,8 @@ module ChupaText
67
65
  end
68
66
 
69
67
  private
70
- def parse(io, listener)
71
- source = REXML::Source.new(io.read)
72
- parser = REXML::Parsers::SAX2Parser.new(source)
73
- parser.listen(listener)
68
+ def parse(input, listener)
69
+ parser = SAXParser.new(input, listener)
74
70
  parser.parse
75
71
  end
76
72
 
@@ -83,9 +79,7 @@ module ChupaText
83
79
  context[:text]
84
80
  end
85
81
 
86
- class TextListener
87
- include REXML::SAX2Listener
88
-
82
+ class TextListener < SAXListener
89
83
  def initialize(output, target_uri)
90
84
  @output = output
91
85
  @target_uri = target_uri
@@ -121,13 +115,11 @@ module ChupaText
121
115
  private
122
116
  def add_text(text)
123
117
  return unless @in_target
124
- @output << CGI.unescapeHTML(text)
118
+ @output << text
125
119
  end
126
120
  end
127
121
 
128
- class AttributesListener
129
- include REXML::SAX2Listener
130
-
122
+ class AttributesListener < SAXListener
131
123
  CORE_PROPERTIES_URI =
132
124
  "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
133
125
  EXTENDED_PROPERTIES_URI =
@@ -49,9 +49,7 @@ module ChupaText
49
49
  end
50
50
  end
51
51
 
52
- class SlidesListener
53
- include REXML::SAX2Listener
54
-
52
+ class SlidesListener < SAXListener
55
53
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
56
54
  DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
57
55
 
@@ -97,7 +95,7 @@ module ChupaText
97
95
  private
98
96
  def add_text(text)
99
97
  return unless @in_p
100
- @slides.last[:text] << CGI.unescapeHTML(text)
98
+ @slides.last[:text] << text
101
99
  end
102
100
  end
103
101
  end
@@ -51,9 +51,7 @@ module ChupaText
51
51
  end
52
52
  end
53
53
 
54
- class SheetsListener
55
- include REXML::SAX2Listener
56
-
54
+ class SheetsListener < SAXListener
57
55
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
58
56
  TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"
59
57
 
@@ -126,7 +124,7 @@ module ChupaText
126
124
  private
127
125
  def add_text(text)
128
126
  return unless @in_p
129
- @sheets.last[:rows].last.last[:text] << CGI.unescapeHTML(text)
127
+ @sheets.last[:rows].last.last[:text] << text
130
128
  end
131
129
  end
132
130
  end
@@ -43,9 +43,7 @@ module ChupaText
43
43
  yield(text_data)
44
44
  end
45
45
 
46
- class TextListener
47
- include REXML::SAX2Listener
48
-
46
+ class TextListener < SAXListener
49
47
  TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
50
48
  def initialize(output)
51
49
  @output = output
@@ -81,7 +79,7 @@ module ChupaText
81
79
  private
82
80
  def add_text(text)
83
81
  return unless @in_p
84
- @output << CGI.unescapeHTML(text)
82
+ @output << text
85
83
  end
86
84
  end
87
85
  end
@@ -14,12 +14,10 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "cgi/util"
18
- require "rexml/parsers/sax2parser"
19
- require "rexml/sax2listener"
20
-
21
17
  require "archive/zip"
22
18
 
19
+ require "chupa-text/sax-parser"
20
+
23
21
  module ChupaText
24
22
  module Decomposers
25
23
  class OpenDocument < Decomposer
@@ -58,10 +56,8 @@ module ChupaText
58
56
  end
59
57
 
60
58
  private
61
- def parse(io, listener)
62
- source = REXML::Source.new(io.read)
63
- parser = REXML::Parsers::SAX2Parser.new(source)
64
- parser.listen(listener)
59
+ def parse(input, listener)
60
+ parser = SAXParser.new(input, listener)
65
61
  parser.parse
66
62
  end
67
63
 
@@ -70,9 +66,7 @@ module ChupaText
70
66
  parse(entry.file_data, listener)
71
67
  end
72
68
 
73
- class AttributesListener
74
- include REXML::SAX2Listener
75
-
69
+ class AttributesListener < SAXListener
76
70
  META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
77
71
  DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
78
72
 
@@ -122,7 +116,6 @@ module ChupaText
122
116
  def set_attribute(value)
123
117
  return if @name.nil?
124
118
 
125
- value = CGI.unescapeHTML(value)
126
119
  case @type
127
120
  when :w3cdtf
128
121
  value = Time.xmlschema(value)
@@ -0,0 +1,151 @@
1
+ # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "cgi/util"
18
+ require "rexml/parsers/sax2parser"
19
+ require "rexml/sax2listener"
20
+
21
+ begin
22
+ require "nokogiri"
23
+ rescue LoadError
24
+ end
25
+
26
+ module ChupaText
27
+ class SAXParser
28
+ class << self
29
+ def backend
30
+ case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
31
+ when "rexml"
32
+ :rexml
33
+ else
34
+ if Object.const_defined?(:Nokogiri)
35
+ :nokogiri
36
+ else
37
+ :rexml
38
+ end
39
+ end
40
+ end
41
+ end
42
+
43
+ def initialize(input, listener)
44
+ @input = input
45
+ @listener = listener
46
+ end
47
+
48
+ if backend == :nokogiri
49
+ def parse
50
+ document = Document.new(@listener)
51
+ parser = Nokogiri::XML::SAX::Parser.new(document)
52
+ parser.parse(@input)
53
+ end
54
+
55
+ class Document < Nokogiri::XML::SAX::Document
56
+ def initialize(listener)
57
+ @listener = listener
58
+ @namespaces_stack = []
59
+ end
60
+
61
+ def start_element_namespace(name,
62
+ attributes=[],
63
+ prefix=nil,
64
+ uri=nil,
65
+ namespaces=[])
66
+ namespaces.each do |namespace_prefix, namespace_uri|
67
+ @listener.start_prefix_mapping(namespace_prefix, namespace_uri)
68
+ end
69
+ attributes_hash = {}
70
+ attributes.each do |attribute|
71
+ attribute_qname = build_qname(attribute.prefix, attribute.localname)
72
+ attributes_hash[attribute_qname] = attribute.value
73
+ end
74
+ @namespaces_stack.push(namespaces)
75
+ @listener.start_element(uri,
76
+ name,
77
+ build_qname(prefix, name),
78
+ attributes_hash)
79
+ end
80
+
81
+ def end_element_namespace(name, prefix=nil, uri=nil)
82
+ @listener.end_element(uri, name, build_qname(prefix, name))
83
+ namespaces = @namespaces_stack.pop
84
+ namespaces.each do |namespace_prefix, _|
85
+ @listener.end_prefix_mapping(namespace_prefix)
86
+ end
87
+ end
88
+
89
+ def characters(text)
90
+ @listener.characters(text)
91
+ end
92
+
93
+ def cdata_block(content)
94
+ @listener.cdata(content)
95
+ end
96
+
97
+ private
98
+ def build_qname(prefix, local_name)
99
+ if prefix
100
+ "#{prefix}:#{local_name}"
101
+ else
102
+ local_name
103
+ end
104
+ end
105
+ end
106
+ else
107
+ def parse
108
+ source = REXML::Source.new(@input.read)
109
+ parser = REXML::Parsers::SAX2Parser.new(source)
110
+ parser.listen(Listener.new(@listener))
111
+ parser.parse
112
+ end
113
+
114
+ class Listener
115
+ include REXML::SAX2Listener
116
+
117
+ def initialize(listener)
118
+ @listener = listener
119
+ end
120
+
121
+ def start_prefix_mapping(*args)
122
+ @listener.start_prefix_mapping(*args)
123
+ end
124
+
125
+ def end_prefix_mapping(*args)
126
+ @listener.end_prefix_mapping(*args)
127
+ end
128
+
129
+ def start_element(*args)
130
+ @listener.start_element(*args)
131
+ end
132
+
133
+ def end_element(*args)
134
+ @listener.end_element(*args)
135
+ end
136
+
137
+ def characters(text)
138
+ @listener.characters(CGI.unescapeHTML(text))
139
+ end
140
+
141
+ def cdata(content)
142
+ @listener.cdata(CGI.unescapeHTML(content))
143
+ end
144
+ end
145
+ end
146
+ end
147
+
148
+ class SAXListener
149
+ include REXML::SAX2Listener
150
+ end
151
+ end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2018 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.1.4"
18
+ VERSION = "1.1.5"
19
19
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.4
4
+ version: 1.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-26 00:00:00.000000000 Z
11
+ date: 2019-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: archive-zip
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: rake
42
+ name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
@@ -53,7 +53,7 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: test-unit
56
+ name: packnga
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - ">="
@@ -67,7 +67,7 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: packnga
70
+ name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: test-unit
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  description: ''
98
112
  email:
99
113
  - kou@clear-code.com
@@ -156,6 +170,7 @@ files:
156
170
  - lib/chupa-text/logger.rb
157
171
  - lib/chupa-text/mime-type-registry.rb
158
172
  - lib/chupa-text/mime-type.rb
173
+ - lib/chupa-text/sax-parser.rb
159
174
  - lib/chupa-text/screenshot.rb
160
175
  - lib/chupa-text/size-parser.rb
161
176
  - lib/chupa-text/text-data.rb