plaintext 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 857600eeedad2b6b305655743d6dd4768faa5e5d
|
4
|
+
data.tar.gz: 17c78e4c90da3f07f5637c92a0891feb54b25b5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 436883566c3f9e1598a3f482c9146195646aa4e5d123aaedfe15876bb2876e93dd4c5bc964c515ece9b5f4e93b25974745bc176bd1aafb51aa4c64084221c7d5
|
7
|
+
data.tar.gz: 92969693830e20b3a1fb842bb9e60a5ed63a93f87903da8c7c88466df7a62e14dec0bb03f457ba7b28e3ce47bf82da3c9cc1a22fae4555a6cba6dce9ebc48405
|
data/CHANGELOG
CHANGED
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.3.1] - 2019-01-16
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- The max_plaintext_bytes limit introduced in 0.3.0 is now also enforced in the
|
13
|
+
zipped XML handlers responsible for office document parsing.
|
14
|
+
|
9
15
|
## [0.3.0] - 2019-01-09
|
10
16
|
|
11
17
|
### Added
|
@@ -9,16 +9,26 @@ module Plaintext
|
|
9
9
|
class SaxDocument < Nokogiri::XML::SAX::Document
|
10
10
|
attr_reader :text
|
11
11
|
|
12
|
-
def initialize(text_element, text_namespace)
|
12
|
+
def initialize(text_element, text_namespace, max_size = nil)
|
13
13
|
@element = text_element
|
14
14
|
@namespace_uri = text_namespace
|
15
|
+
@max_size = max_size
|
16
|
+
|
15
17
|
@text = ''.dup
|
16
18
|
@is_text = false
|
17
19
|
end
|
18
20
|
|
21
|
+
def text_length_exceeded?
|
22
|
+
@max_size && (@text.length > @max_size)
|
23
|
+
end
|
24
|
+
|
25
|
+
|
19
26
|
# Handle each element, expecting the name and any attributes
|
20
27
|
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
|
21
|
-
if name == @element and
|
28
|
+
if name == @element and
|
29
|
+
uri == @namespace_uri and
|
30
|
+
!text_length_exceeded?
|
31
|
+
|
22
32
|
@is_text = true
|
23
33
|
end
|
24
34
|
end
|
@@ -30,7 +40,10 @@ module Plaintext
|
|
30
40
|
|
31
41
|
# Given the name of an element once its closing tag is reached
|
32
42
|
def end_element_namespace(name, prefix = nil, uri = nil)
|
33
|
-
if name == @element and
|
43
|
+
if name == @element and
|
44
|
+
uri == @namespace_uri and
|
45
|
+
@is_text
|
46
|
+
|
34
47
|
@text << ' '
|
35
48
|
@is_text = false
|
36
49
|
end
|
@@ -38,10 +51,11 @@ module Plaintext
|
|
38
51
|
end
|
39
52
|
|
40
53
|
def text(file, options = {})
|
54
|
+
max_size = options[:max_size]
|
41
55
|
Zip::File.open(file) do |zip_file|
|
42
56
|
zip_file.each do |entry|
|
43
57
|
if entry.name == @file_name
|
44
|
-
return xml_to_text entry.get_input_stream
|
58
|
+
return xml_to_text entry.get_input_stream, max_size
|
45
59
|
end
|
46
60
|
end
|
47
61
|
end
|
@@ -49,10 +63,11 @@ module Plaintext
|
|
49
63
|
|
50
64
|
private
|
51
65
|
|
52
|
-
def xml_to_text(io)
|
53
|
-
sax_doc = SaxDocument.new @element, @namespace_uri
|
66
|
+
def xml_to_text(io, max_size)
|
67
|
+
sax_doc = SaxDocument.new @element, @namespace_uri, max_size
|
54
68
|
Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
|
55
|
-
sax_doc.text
|
69
|
+
text = sax_doc.text
|
70
|
+
max_size.present? ? text[0, max_size] : text
|
56
71
|
end
|
57
72
|
end
|
58
73
|
end
|
@@ -15,16 +15,23 @@ module Plaintext
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def text(file, options = {})
|
18
|
+
max_size = options[:max_size]
|
18
19
|
slides = []
|
20
|
+
result = ''.dup
|
19
21
|
Zip::File.open(file) do |zip_file|
|
20
22
|
zip_file.each do |entry|
|
21
23
|
if entry.name =~ /slide(\d+)\.xml/
|
22
|
-
slides << [$1,
|
24
|
+
slides << [$1, entry]
|
23
25
|
end
|
24
26
|
end
|
27
|
+
|
28
|
+
slides.sort!{|a, b| a.first <=> b.first}
|
29
|
+
slides.each do |id, entry|
|
30
|
+
result << xml_to_text(entry.get_input_stream, max_size)
|
31
|
+
break if max_size and result.length >= max_size
|
32
|
+
end
|
25
33
|
end
|
26
|
-
|
27
|
-
slides.map(&:last).join ' '
|
34
|
+
return result
|
28
35
|
end
|
29
36
|
end
|
30
37
|
end
|
data/lib/plaintext/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plaintext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Krämer
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date: 2019-01-
|
13
|
+
date: 2019-01-16 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rubyzip
|
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
155
155
|
version: '0'
|
156
156
|
requirements: []
|
157
157
|
rubyforge_project:
|
158
|
-
rubygems_version: 2.5.
|
158
|
+
rubygems_version: 2.4.5.5
|
159
159
|
signing_key:
|
160
160
|
specification_version: 4
|
161
161
|
summary: Extract plain text from most common office documents.
|