plaintext 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 857600eeedad2b6b305655743d6dd4768faa5e5d
|
4
|
+
data.tar.gz: 17c78e4c90da3f07f5637c92a0891feb54b25b5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 436883566c3f9e1598a3f482c9146195646aa4e5d123aaedfe15876bb2876e93dd4c5bc964c515ece9b5f4e93b25974745bc176bd1aafb51aa4c64084221c7d5
|
7
|
+
data.tar.gz: 92969693830e20b3a1fb842bb9e60a5ed63a93f87903da8c7c88466df7a62e14dec0bb03f457ba7b28e3ce47bf82da3c9cc1a22fae4555a6cba6dce9ebc48405
|
data/CHANGELOG
CHANGED
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.3.1] - 2019-01-16
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- The max_plaintext_bytes limit introduced in 0.3.0 is now also enforced in the
|
13
|
+
zipped XML handlers responsible for office document parsing.
|
14
|
+
|
9
15
|
## [0.3.0] - 2019-01-09
|
10
16
|
|
11
17
|
### Added
|
@@ -9,16 +9,26 @@ module Plaintext
|
|
9
9
|
class SaxDocument < Nokogiri::XML::SAX::Document
|
10
10
|
attr_reader :text
|
11
11
|
|
12
|
-
def initialize(text_element, text_namespace)
|
12
|
+
def initialize(text_element, text_namespace, max_size = nil)
|
13
13
|
@element = text_element
|
14
14
|
@namespace_uri = text_namespace
|
15
|
+
@max_size = max_size
|
16
|
+
|
15
17
|
@text = ''.dup
|
16
18
|
@is_text = false
|
17
19
|
end
|
18
20
|
|
21
|
+
def text_length_exceeded?
|
22
|
+
@max_size && (@text.length > @max_size)
|
23
|
+
end
|
24
|
+
|
25
|
+
|
19
26
|
# Handle each element, expecting the name and any attributes
|
20
27
|
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
|
21
|
-
if name == @element and uri == @namespace_uri
|
28
|
+
if name == @element and
|
29
|
+
uri == @namespace_uri and
|
30
|
+
!text_length_exceeded?
|
31
|
+
|
22
32
|
@is_text = true
|
23
33
|
end
|
24
34
|
end
|
@@ -30,7 +40,10 @@ module Plaintext
|
|
30
40
|
|
31
41
|
# Given the name of an element once its closing tag is reached
|
32
42
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
33
|
-
if name == @element and uri == @namespace_uri
|
43
|
+
if name == @element and
|
44
|
+
uri == @namespace_uri and
|
45
|
+
@is_text
|
46
|
+
|
34
47
|
@text << ' '
|
35
48
|
@is_text = false
|
36
49
|
end
|
@@ -38,10 +51,11 @@ module Plaintext
|
|
38
51
|
end
|
39
52
|
|
40
53
|
def text(file, options = {})
|
54
|
+
max_size = options[:max_size]
|
41
55
|
Zip::File.open(file) do |zip_file|
|
42
56
|
zip_file.each do |entry|
|
43
57
|
if entry.name == @file_name
|
44
|
-
return xml_to_text entry.get_input_stream
|
58
|
+
return xml_to_text entry.get_input_stream, max_size
|
45
59
|
end
|
46
60
|
end
|
47
61
|
end
|
@@ -49,10 +63,11 @@ module Plaintext
|
|
49
63
|
|
50
64
|
private
|
51
65
|
|
52
|
-
def xml_to_text(io)
|
53
|
-
sax_doc = SaxDocument.new @element, @namespace_uri
|
66
|
+
def xml_to_text(io, max_size)
|
67
|
+
sax_doc = SaxDocument.new @element, @namespace_uri, max_size
|
54
68
|
Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
|
55
|
-
sax_doc.text
|
69
|
+
text = sax_doc.text
|
70
|
+
max_size.present? ? text[0, max_size] : text
|
56
71
|
end
|
57
72
|
end
|
58
73
|
end
|
@@ -15,16 +15,23 @@ module Plaintext
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def text(file, options = {})
|
18
|
+
max_size = options[:max_size]
|
18
19
|
slides = []
|
20
|
+
result = ''.dup
|
19
21
|
Zip::File.open(file) do |zip_file|
|
20
22
|
zip_file.each do |entry|
|
21
23
|
if entry.name =~ /slide(\d+)\.xml/
|
22
|
-
slides << [$1, xml_to_text(entry.get_input_stream)]
|
24
|
+
slides << [$1, entry]
|
23
25
|
end
|
24
26
|
end
|
27
|
+
|
28
|
+
slides.sort!{|a, b| a.first <=> b.first}
|
29
|
+
slides.each do |id, entry|
|
30
|
+
result << xml_to_text(entry.get_input_stream, max_size)
|
31
|
+
break if max_size and result.length >= max_size
|
32
|
+
end
|
25
33
|
end
|
26
|
-
|
27
|
-
slides.map(&:last).join ' '
|
34
|
+
return result
|
28
35
|
end
|
29
36
|
end
|
30
37
|
end
|
data/lib/plaintext/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plaintext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Krämer
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date: 2019-01-
|
13
|
+
date: 2019-01-16 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rubyzip
|
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
155
155
|
version: '0'
|
156
156
|
requirements: []
|
157
157
|
rubyforge_project:
|
158
|
-
rubygems_version: 2.5.
|
158
|
+
rubygems_version: 2.4.5.5
|
159
159
|
signing_key:
|
160
160
|
specification_version: 4
|
161
161
|
summary: Extract plain text from most common office documents.
|