plaintext 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eb87c916a0f34e20251b5a97cbcf1e354fc046b0
4
- data.tar.gz: e6dff85b7b0cec1ae3fdebbaabb950be1b563215
3
+ metadata.gz: 857600eeedad2b6b305655743d6dd4768faa5e5d
4
+ data.tar.gz: 17c78e4c90da3f07f5637c92a0891feb54b25b5b
5
5
  SHA512:
6
- metadata.gz: 48be94f65a07c3e400bacfae575758494afbf3d6004b19a99f7838a032b26d4503ae3ea47b46548de0dd34b4892c551d5d7af3620a72375ae9a771afc0601cc2
7
- data.tar.gz: 0b5dc9aa9abae205083d3dd4dcc89572bf36361fee098c6145472735ac6d10c810b07595be095565baa18b99534b61bbbc63b4e26c445ccdac94a3468b6c962c
6
+ metadata.gz: 436883566c3f9e1598a3f482c9146195646aa4e5d123aaedfe15876bb2876e93dd4c5bc964c515ece9b5f4e93b25974745bc176bd1aafb51aa4c64084221c7d5
7
+ data.tar.gz: 92969693830e20b3a1fb842bb9e60a5ed63a93f87903da8c7c88466df7a62e14dec0bb03f457ba7b28e3ce47bf82da3c9cc1a22fae4555a6cba6dce9ebc48405
data/CHANGELOG CHANGED
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.1] - 2019-01-16
10
+
11
+ ### Added
12
+ - The max_plaintext_bytes limit introduced in 0.3.0 is now also enforced in the
13
+ zipped XML handlers responsible for office document parsing.
14
+
9
15
  ## [0.3.0] - 2019-01-09
10
16
 
11
17
  ### Added
@@ -9,16 +9,26 @@ module Plaintext
9
9
  class SaxDocument < Nokogiri::XML::SAX::Document
10
10
  attr_reader :text
11
11
 
12
- def initialize(text_element, text_namespace)
12
+ def initialize(text_element, text_namespace, max_size = nil)
13
13
  @element = text_element
14
14
  @namespace_uri = text_namespace
15
+ @max_size = max_size
16
+
15
17
  @text = ''.dup
16
18
  @is_text = false
17
19
  end
18
20
 
21
+ def text_length_exceeded?
22
+ @max_size && (@text.length > @max_size)
23
+ end
24
+
25
+
19
26
  # Handle each element, expecting the name and any attributes
20
27
  def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
21
- if name == @element and uri == @namespace_uri
28
+ if name == @element and
29
+ uri == @namespace_uri and
30
+ !text_length_exceeded?
31
+
22
32
  @is_text = true
23
33
  end
24
34
  end
@@ -30,7 +40,10 @@ module Plaintext
30
40
 
31
41
  # Given the name of an element once its closing tag is reached
32
42
  def end_element_namespace(name, prefix = nil, uri = nil)
33
- if name == @element and uri == @namespace_uri
43
+ if name == @element and
44
+ uri == @namespace_uri and
45
+ @is_text
46
+
34
47
  @text << ' '
35
48
  @is_text = false
36
49
  end
@@ -38,10 +51,11 @@ module Plaintext
38
51
  end
39
52
 
40
53
  def text(file, options = {})
54
+ max_size = options[:max_size]
41
55
  Zip::File.open(file) do |zip_file|
42
56
  zip_file.each do |entry|
43
57
  if entry.name == @file_name
44
- return xml_to_text entry.get_input_stream
58
+ return xml_to_text entry.get_input_stream, max_size
45
59
  end
46
60
  end
47
61
  end
@@ -49,10 +63,11 @@ module Plaintext
49
63
 
50
64
  private
51
65
 
52
- def xml_to_text(io)
53
- sax_doc = SaxDocument.new @element, @namespace_uri
66
+ def xml_to_text(io, max_size)
67
+ sax_doc = SaxDocument.new @element, @namespace_uri, max_size
54
68
  Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
55
- sax_doc.text
69
+ text = sax_doc.text
70
+ max_size.present? ? text[0, max_size] : text
56
71
  end
57
72
  end
58
73
  end
@@ -15,16 +15,23 @@ module Plaintext
15
15
  end
16
16
 
17
17
  def text(file, options = {})
18
+ max_size = options[:max_size]
18
19
  slides = []
20
+ result = ''.dup
19
21
  Zip::File.open(file) do |zip_file|
20
22
  zip_file.each do |entry|
21
23
  if entry.name =~ /slide(\d+)\.xml/
22
- slides << [$1, xml_to_text(entry.get_input_stream)]
24
+ slides << [$1, entry]
23
25
  end
24
26
  end
27
+
28
+ slides.sort!{|a, b| a.first <=> b.first}
29
+ slides.each do |id, entry|
30
+ result << xml_to_text(entry.get_input_stream, max_size)
31
+ break if max_size and result.length >= max_size
32
+ end
25
33
  end
26
- slides.sort!{|a, b| a.first <=> b.first}
27
- slides.map(&:last).join ' '
34
+ return result
28
35
  end
29
36
  end
30
37
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
- VERSION = "0.3.0"
4
+ VERSION = "0.3.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plaintext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Krämer
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2019-01-09 00:00:00.000000000 Z
13
+ date: 2019-01-16 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rubyzip
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
155
155
  version: '0'
156
156
  requirements: []
157
157
  rubyforge_project:
158
- rubygems_version: 2.5.2.1
158
+ rubygems_version: 2.4.5.5
159
159
  signing_key:
160
160
  specification_version: 4
161
161
  summary: Extract plain text from most common office documents.