plaintext 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eb87c916a0f34e20251b5a97cbcf1e354fc046b0
4
- data.tar.gz: e6dff85b7b0cec1ae3fdebbaabb950be1b563215
3
+ metadata.gz: 857600eeedad2b6b305655743d6dd4768faa5e5d
4
+ data.tar.gz: 17c78e4c90da3f07f5637c92a0891feb54b25b5b
5
5
  SHA512:
6
- metadata.gz: 48be94f65a07c3e400bacfae575758494afbf3d6004b19a99f7838a032b26d4503ae3ea47b46548de0dd34b4892c551d5d7af3620a72375ae9a771afc0601cc2
7
- data.tar.gz: 0b5dc9aa9abae205083d3dd4dcc89572bf36361fee098c6145472735ac6d10c810b07595be095565baa18b99534b61bbbc63b4e26c445ccdac94a3468b6c962c
6
+ metadata.gz: 436883566c3f9e1598a3f482c9146195646aa4e5d123aaedfe15876bb2876e93dd4c5bc964c515ece9b5f4e93b25974745bc176bd1aafb51aa4c64084221c7d5
7
+ data.tar.gz: 92969693830e20b3a1fb842bb9e60a5ed63a93f87903da8c7c88466df7a62e14dec0bb03f457ba7b28e3ce47bf82da3c9cc1a22fae4555a6cba6dce9ebc48405
data/CHANGELOG CHANGED
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.1] - 2019-01-16
10
+
11
+ ### Added
12
+ - The max_plaintext_bytes limit introduced in 0.3.0 is now also enforced in the
13
+ zipped XML handlers responsible for office document parsing.
14
+
9
15
  ## [0.3.0] - 2019-01-09
10
16
 
11
17
  ### Added
@@ -9,16 +9,26 @@ module Plaintext
9
9
  class SaxDocument < Nokogiri::XML::SAX::Document
10
10
  attr_reader :text
11
11
 
12
- def initialize(text_element, text_namespace)
12
+ def initialize(text_element, text_namespace, max_size = nil)
13
13
  @element = text_element
14
14
  @namespace_uri = text_namespace
15
+ @max_size = max_size
16
+
15
17
  @text = ''.dup
16
18
  @is_text = false
17
19
  end
18
20
 
21
+ def text_length_exceeded?
22
+ @max_size && (@text.length > @max_size)
23
+ end
24
+
25
+
19
26
  # Handle each element, expecting the name and any attributes
20
27
  def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
21
- if name == @element and uri == @namespace_uri
28
+ if name == @element and
29
+ uri == @namespace_uri and
30
+ !text_length_exceeded?
31
+
22
32
  @is_text = true
23
33
  end
24
34
  end
@@ -30,7 +40,10 @@ module Plaintext
30
40
 
31
41
  # Given the name of an element once its closing tag is reached
32
42
  def end_element_namespace(name, prefix = nil, uri = nil)
33
- if name == @element and uri == @namespace_uri
43
+ if name == @element and
44
+ uri == @namespace_uri and
45
+ @is_text
46
+
34
47
  @text << ' '
35
48
  @is_text = false
36
49
  end
@@ -38,10 +51,11 @@ module Plaintext
38
51
  end
39
52
 
40
53
  def text(file, options = {})
54
+ max_size = options[:max_size]
41
55
  Zip::File.open(file) do |zip_file|
42
56
  zip_file.each do |entry|
43
57
  if entry.name == @file_name
44
- return xml_to_text entry.get_input_stream
58
+ return xml_to_text entry.get_input_stream, max_size
45
59
  end
46
60
  end
47
61
  end
@@ -49,10 +63,11 @@ module Plaintext
49
63
 
50
64
  private
51
65
 
52
- def xml_to_text(io)
53
- sax_doc = SaxDocument.new @element, @namespace_uri
66
+ def xml_to_text(io, max_size)
67
+ sax_doc = SaxDocument.new @element, @namespace_uri, max_size
54
68
  Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
55
- sax_doc.text
69
+ text = sax_doc.text
70
+ max_size.present? ? text[0, max_size] : text
56
71
  end
57
72
  end
58
73
  end
@@ -15,16 +15,23 @@ module Plaintext
15
15
  end
16
16
 
17
17
  def text(file, options = {})
18
+ max_size = options[:max_size]
18
19
  slides = []
20
+ result = ''.dup
19
21
  Zip::File.open(file) do |zip_file|
20
22
  zip_file.each do |entry|
21
23
  if entry.name =~ /slide(\d+)\.xml/
22
- slides << [$1, xml_to_text(entry.get_input_stream)]
24
+ slides << [$1, entry]
23
25
  end
24
26
  end
27
+
28
+ slides.sort!{|a, b| a.first <=> b.first}
29
+ slides.each do |id, entry|
30
+ result << xml_to_text(entry.get_input_stream, max_size)
31
+ break if max_size and result.length >= max_size
32
+ end
25
33
  end
26
- slides.sort!{|a, b| a.first <=> b.first}
27
- slides.map(&:last).join ' '
34
+ return result
28
35
  end
29
36
  end
30
37
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
- VERSION = "0.3.0"
4
+ VERSION = "0.3.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plaintext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Krämer
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2019-01-09 00:00:00.000000000 Z
13
+ date: 2019-01-16 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rubyzip
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
155
155
  version: '0'
156
156
  requirements: []
157
157
  rubyforge_project:
158
- rubygems_version: 2.5.2.1
158
+ rubygems_version: 2.4.5.5
159
159
  signing_key:
160
160
  specification_version: 4
161
161
  summary: Extract plain text from most common office documents.