plaintext 0.2.0 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: c04f6f06ab6a16b423c2a74ddb38685a49a23372
4
- data.tar.gz: 8dc49ce59bde01f25983b982323889a512fa900f
2
+ SHA256:
3
+ metadata.gz: fde4677879a9d3ce30844e9a5b0439ad2f708f2f532927f96b16d962bd4deb07
4
+ data.tar.gz: 900cf36c77f25e98876e9838f2e21a11244f37f059c6c272386fd8102cfd500d
5
5
  SHA512:
6
- metadata.gz: 13876311c960330abc0596ccc09a5eb172feca1101fa154386182bb04c7cfe6ca5f2b8fb4a408ffa658fbe0cbc616e29c8770c2a0680ec11064e950a75eae19d
7
- data.tar.gz: 50f4d40bcb4bfb257b4e280e081cea291a633203746c322c028e6dcacd53b68f0e5c50da6ede80bdeb0ae5a7c300c0ddb38694b4ee97d6ed8343ae81e5c8a69e
6
+ metadata.gz: c744870cc385445ae9a8ad68cfbc5e9489356f23750acdf7a0e8b54ec1ae9c674457c1f14e226aa8aee3bf542ba5fabeac524fda6ad0d89601414c0b77196e5a
7
+ data.tar.gz: e6c899c622b42856fd2671b0a7961a938e89389801904d70069b1b1735b8e6e374610f7f2b4d72fd08b16db1f24dcaa994e22b59f9e2a73585206f173b28a234
data/.travis.yml CHANGED
@@ -1,7 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.2.3
3
+ - 2.6.4
4
4
  before_install:
5
5
  - sudo apt-get -qq update
6
6
  - sudo apt-get install -y catdoc unrtf poppler-utils tesseract-ocr
7
- - gem install bundler -v 1.10.6
7
+ - gem install bundler -v 2.0.1
data/CHANGELOG CHANGED
@@ -6,9 +6,50 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.4] - 2021-04-21
10
+ - Further relax the rubyzip version requirement to allow 2.x versions
11
+ - Fix: Removed --quiet from unrtf arguments. In newer versions this seems to
12
+ suppress the header line which we used as a marker to strip unrtf comments and
13
+ metadata from the command output.
14
+
15
+ ## [0.3.3] - 2019-10-01
16
+ - Relax rubyzip dependency. Our usage of rubyzip does not require addressing
17
+ CVE-2019-16892 and thus is fine with 1.2, but applications using this Gem
18
+ might use RubyZIP elsewhere too and thus will want to upgrade to >= 1.3.
19
+
20
+ ## [0.3.2] - 2019-09-02
21
+ - Set minimum Nokogiri version to 1.10.4. See CVE-2019-5477.
22
+ - Fix encoding issues for PDFs.
23
+ - Bump development dependencies to bundler version 2 and
24
+ rake version 12
25
+ - Update travis file to use ruby 2.6.4 and bundler 2.0.1
26
+
27
+ ## [0.3.1] - 2019-01-16
28
+
29
+ ### Added
30
+ - The max_plaintext_bytes limit introduced in 0.3.0 is now also enforced in the
31
+ zipped XML handlers responsible for office document parsing.
32
+
33
+ ## [0.3.0] - 2019-01-09
34
+
35
+ ### Added
36
+ - `:max_plaintext_bytes` option to place an upper limit on the number of bytes
37
+ returned. Also limits the amount of data that is actually read from plain text
38
+ files and external command output to limit memory usage. Set this on the
39
+ resolver object.
40
+
41
+ ### Changed
42
+ - the unrtf handler now strips the preamble which unrtf adds to its output.
43
+ - text from external command handlers is now converted to UTF-8 before it's returned.
44
+ - the FileHandler `text` method signature has been changed by adding an options
45
+ hash to support passing the max_plaintext_bytes limit.
46
+
47
+
9
48
  ## [0.2.0] - 2018-12-22
49
+
10
50
  ### Changed
11
51
  - relaxed Nokogiri dependency to '~> 1.8' for Redmine 4 (which uses 1.9)
12
52
 
53
+
13
54
  ## [0.1.0] - 2018-02-15
14
55
  - Initial release
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
+ gem "byebug"
4
+
3
5
  # Specify your gem's dependencies in plaintext.gemspec
4
- gemspec
6
+ gemspec
data/README.md CHANGED
@@ -109,6 +109,9 @@ catdoc:
109
109
  fulltext = Plaintext::Resolver.new(file, content_type).text
110
110
  ```
111
111
 
112
+ To limit the number of bytes returned (default is 4MB), set the
113
+ `max_plaintext_bytes` property on the resolver instance before calling `text`.
114
+
112
115
  ## License
113
116
 
114
117
  The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
@@ -11,5 +11,27 @@ module Plaintext
11
11
  false
12
12
  end
13
13
  end
14
+
15
+ # use `#set(max_size: 1.megabyte)` to give an upper limit of data to be read.
16
+ #
17
+ # By default, all data (whole file / command output) will be read which can
18
+ # be a problem with huge text files (e.g. SQL dumps)
19
+ def set(args = {})
20
+ options.update args
21
+ self
22
+ end
23
+
24
+ private
25
+
26
+ # maximum number of bytes to read from external command output or text
27
+ # files
28
+ def max_size
29
+ options[:max_size]
30
+ end
31
+
32
+ def options
33
+ @options ||= {}
34
+ end
35
+
14
36
  end
15
- end
37
+ end
@@ -11,23 +11,26 @@ module Plaintext
11
11
  # Due to how popen works the command will be executed directly without
12
12
  # involving the shell if cmd is an array.
13
13
  require 'fileutils'
14
+
15
+ FILE_PLACEHOLDER = '__FILE__'.freeze
16
+ DEFAULT_STREAM_ENCODING = 'ASCII-8BIT'.freeze
17
+
14
18
  def shellout(cmd, options = {}, &block)
15
19
  mode = "r+"
16
20
  IO.popen(cmd, mode) do |io|
17
- io.set_encoding("ASCII-8BIT") if io.respond_to?(:set_encoding)
21
+ set_stream_encoding(io)
18
22
  io.close_write unless options[:write_stdin]
19
23
  block.call(io) if block_given?
20
24
  end
21
25
  end
22
26
 
23
- FILE_PLACEHOLDER = '__FILE__'.freeze
24
-
25
- def text(file)
27
+ def text(file, options = {})
26
28
  cmd = @command.dup
27
29
  cmd[cmd.index(FILE_PLACEHOLDER)] = Pathname(file).to_s
28
- shellout(cmd){ |io| io.read }.to_s
30
+ shellout(cmd) { |io| read io, options[:max_size] }.to_s
29
31
  end
30
32
 
33
+
31
34
  def accept?(content_type)
32
35
  super and available?
33
36
  end
@@ -39,5 +42,33 @@ module Plaintext
39
42
  def self.available?
40
43
  new.available?
41
44
  end
45
+
46
+ protected
47
+
48
+ def utf8_stream?
49
+ false
50
+ end
51
+
52
+ private
53
+
54
+ def set_stream_encoding(io)
55
+ return unless io.respond_to?(:set_encoding)
56
+
57
+ if utf8_stream?
58
+ io.set_encoding('UTF-8'.freeze)
59
+ else
60
+ io.set_encoding(DEFAULT_STREAM_ENCODING)
61
+ end
62
+ end
63
+
64
+ def read(io, max_size = nil)
65
+ piece = io.read(max_size)
66
+
67
+ if utf8_stream?
68
+ piece
69
+ else
70
+ Plaintext::CodesetUtil.to_utf8 piece, DEFAULT_STREAM_ENCODING
71
+ end
72
+ end
42
73
  end
43
74
  end
@@ -5,9 +5,16 @@ module Plaintext
5
5
  DEFAULT = [
6
6
  '/usr/bin/pdftotext', '-enc', 'UTF-8', '__FILE__', '-'
7
7
  ].freeze
8
+
8
9
  def initialize
9
10
  @content_type = 'application/pdf'
10
11
  @command = Plaintext::Configuration['pdftotext'] || DEFAULT
11
12
  end
13
+
14
+ protected
15
+
16
+ def utf8_stream?
17
+ true
18
+ end
12
19
  end
13
20
  end
@@ -3,11 +3,34 @@
3
3
  module Plaintext
4
4
  class RtfHandler < ExternalCommandHandler
5
5
  DEFAULT = [
6
- '/usr/bin/unrtf', '--text', '__FILE__'
6
+ '/usr/bin/unrtf', '--nopict', '--text', '__FILE__'
7
7
  ].freeze
8
8
  def initialize
9
9
  @content_type = 'application/rtf'
10
10
  @command = Plaintext::Configuration['unrtf'] || DEFAULT
11
11
  end
12
+
13
+ private
14
+
15
+ UNRTF_HEADER = "### Translation from RTF performed by UnRTF"
16
+ END_MARKER = "-----------------\n"
17
+
18
+ def read(io, max_size = nil)
19
+ if line = io.read(UNRTF_HEADER.length)
20
+ string = if line.starts_with? UNRTF_HEADER
21
+ io.gets while $_ != END_MARKER
22
+ io.read max_size
23
+ else
24
+ if max_size.nil?
25
+ line + io.read
26
+ elsif max_size > UNRTF_HEADER.length
27
+ line + io.read(max_size - UNRTF_HEADER.length)
28
+ else
29
+ line[0,max_size]
30
+ end
31
+ end
32
+ Plaintext::CodesetUtil.to_utf8 string, "ASCII-8BIT"
33
+ end
34
+ end
12
35
  end
13
36
  end
@@ -1,14 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
+
4
5
  class PlaintextHandler < FileHandler
5
6
  CONTENT_TYPES = %w(text/csv text/plain)
6
7
  def initialize
7
8
  @content_types = CONTENT_TYPES
8
9
  end
9
10
 
10
- def text(file)
11
- Plaintext::CodesetUtil.to_utf8 IO.read(file), 'UTF-8'
11
+ def text(file, options = {})
12
+ max_size = options[:max_size]
13
+ Plaintext::CodesetUtil.to_utf8 IO.read(file, max_size), 'UTF-8'
12
14
  end
13
15
  end
14
- end
16
+ end
@@ -9,16 +9,26 @@ module Plaintext
9
9
  class SaxDocument < Nokogiri::XML::SAX::Document
10
10
  attr_reader :text
11
11
 
12
- def initialize(text_element, text_namespace)
12
+ def initialize(text_element, text_namespace, max_size = nil)
13
13
  @element = text_element
14
14
  @namespace_uri = text_namespace
15
+ @max_size = max_size
16
+
15
17
  @text = ''.dup
16
18
  @is_text = false
17
19
  end
18
20
 
21
+ def text_length_exceeded?
22
+ @max_size && (@text.length > @max_size)
23
+ end
24
+
25
+
19
26
  # Handle each element, expecting the name and any attributes
20
27
  def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
21
- if name == @element and uri == @namespace_uri
28
+ if name == @element and
29
+ uri == @namespace_uri and
30
+ !text_length_exceeded?
31
+
22
32
  @is_text = true
23
33
  end
24
34
  end
@@ -30,18 +40,22 @@ module Plaintext
30
40
 
31
41
  # Given the name of an element once its closing tag is reached
32
42
  def end_element_namespace(name, prefix = nil, uri = nil)
33
- if name == @element and uri == @namespace_uri
43
+ if name == @element and
44
+ uri == @namespace_uri and
45
+ @is_text
46
+
34
47
  @text << ' '
35
48
  @is_text = false
36
49
  end
37
50
  end
38
51
  end
39
52
 
40
- def text(file)
53
+ def text(file, options = {})
54
+ max_size = options[:max_size]
41
55
  Zip::File.open(file) do |zip_file|
42
56
  zip_file.each do |entry|
43
57
  if entry.name == @file_name
44
- return xml_to_text entry.get_input_stream
58
+ return xml_to_text entry.get_input_stream, max_size
45
59
  end
46
60
  end
47
61
  end
@@ -49,10 +63,11 @@ module Plaintext
49
63
 
50
64
  private
51
65
 
52
- def xml_to_text(io)
53
- sax_doc = SaxDocument.new @element, @namespace_uri
66
+ def xml_to_text(io, max_size)
67
+ sax_doc = SaxDocument.new @element, @namespace_uri, max_size
54
68
  Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
55
- sax_doc.text
69
+ text = sax_doc.text
70
+ max_size.present? ? text[0, max_size] : text
56
71
  end
57
72
  end
58
- end
73
+ end
@@ -14,17 +14,24 @@ module Plaintext
14
14
  @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
15
15
  end
16
16
 
17
- def text(file)
17
+ def text(file, options = {})
18
+ max_size = options[:max_size]
18
19
  slides = []
20
+ result = ''.dup
19
21
  Zip::File.open(file) do |zip_file|
20
22
  zip_file.each do |entry|
21
23
  if entry.name =~ /slide(\d+)\.xml/
22
- slides << [$1, xml_to_text(entry.get_input_stream)]
24
+ slides << [$1, entry]
23
25
  end
24
26
  end
27
+
28
+ slides.sort!{|a, b| a.first <=> b.first}
29
+ slides.each do |id, entry|
30
+ result << xml_to_text(entry.get_input_stream, max_size)
31
+ break if max_size and result.length >= max_size
32
+ end
25
33
  end
26
- slides.sort!{|a, b| a.first <=> b.first}
27
- slides.map(&:last).join ' '
34
+ return result
28
35
  end
29
36
  end
30
- end
37
+ end
@@ -2,7 +2,9 @@
2
2
 
3
3
  module Plaintext
4
4
  class Resolver
5
- MAX_FULLTEXT_LENGTH = 4_194_304 # 4 megabytes
5
+
6
+ # maximum length of returned plain text in bytes. Default: 4MB
7
+ attr_accessor :max_plaintext_bytes
6
8
 
7
9
  class << self
8
10
  attr_accessor :cached_file_handlers
@@ -26,15 +28,19 @@ module Plaintext
26
28
  def initialize(file, content_type = nil)
27
29
  @file = file
28
30
  @content_type = content_type
31
+ @max_plaintext_bytes = 4_194_304 # 4 megabytes
29
32
  end
30
33
 
34
+
31
35
  # Returns the extracted fulltext or nil if no matching handler was found
32
36
  # for the file type.
33
37
  def text
34
- if handler = find_handler and text = handler.text(@file)
35
- text.gsub! /\s+/m, ' '
38
+ if handler = find_handler and
39
+ text = handler.text(@file, max_size: max_plaintext_bytes)
40
+
41
+ text.gsub!(/\s+/m, ' ')
36
42
  text.strip!
37
- text.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
43
+ text.mb_chars.compose.limit(max_plaintext_bytes).to_s
38
44
  end
39
45
  end
40
46
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.4"
5
5
  end
data/plaintext.gemspec CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'plaintext/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "plaintext"
7
+ spec.name = 'plaintext'
8
8
  spec.version = Plaintext::VERSION
9
9
  spec.authors = ['Jens Krämer', 'Planio GmbH', 'OpenProject GmbH']
10
10
  spec.email = ['info@openproject.com']
@@ -12,17 +12,18 @@ Gem::Specification.new do |spec|
12
12
  spec.summary = 'Extract plain text from most common office documents.'
13
13
  spec.description = "Extract text from common office files. Based on the file's content type a command line tool is selected to do the job."
14
14
  spec.homepage = 'https://github.com/planio-gmbh/plaintext'
15
-
15
+ spec.license = 'GPL-2.0'
16
+
16
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
18
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
20
21
 
21
- spec.add_dependency 'rubyzip', '~> 1.2.1'
22
- spec.add_dependency 'nokogiri', '~> 1.8'
23
22
  spec.add_dependency 'activesupport', '>2.2.1 '
23
+ spec.add_dependency 'nokogiri', '~> 1.10', '>= 1.10.4'
24
+ spec.add_dependency 'rubyzip', '>= 1.2.0'
24
25
 
25
- spec.add_development_dependency "bundler", "~> 1.10"
26
- spec.add_development_dependency "rake", "~> 10.0"
27
- spec.add_development_dependency "rspec"
26
+ spec.add_development_dependency 'bundler', '~> 2.0'
27
+ spec.add_development_dependency 'rake', '~> 12.0'
28
+ spec.add_development_dependency 'rspec'
28
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plaintext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Krämer
@@ -10,78 +10,84 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2018-12-22 00:00:00.000000000 Z
13
+ date: 2021-04-21 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: rubyzip
16
+ name: activesupport
17
17
  requirement: !ruby/object:Gem::Requirement
18
18
  requirements:
19
- - - "~>"
19
+ - - ">"
20
20
  - !ruby/object:Gem::Version
21
- version: 1.2.1
21
+ version: 2.2.1
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  requirements:
26
- - - "~>"
26
+ - - ">"
27
27
  - !ruby/object:Gem::Version
28
- version: 1.2.1
28
+ version: 2.2.1
29
29
  - !ruby/object:Gem::Dependency
30
30
  name: nokogiri
31
31
  requirement: !ruby/object:Gem::Requirement
32
32
  requirements:
33
33
  - - "~>"
34
34
  - !ruby/object:Gem::Version
35
- version: '1.8'
35
+ version: '1.10'
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: 1.10.4
36
39
  type: :runtime
37
40
  prerelease: false
38
41
  version_requirements: !ruby/object:Gem::Requirement
39
42
  requirements:
40
43
  - - "~>"
41
44
  - !ruby/object:Gem::Version
42
- version: '1.8'
45
+ version: '1.10'
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: 1.10.4
43
49
  - !ruby/object:Gem::Dependency
44
- name: activesupport
50
+ name: rubyzip
45
51
  requirement: !ruby/object:Gem::Requirement
46
52
  requirements:
47
- - - ">"
53
+ - - ">="
48
54
  - !ruby/object:Gem::Version
49
- version: 2.2.1
55
+ version: 1.2.0
50
56
  type: :runtime
51
57
  prerelease: false
52
58
  version_requirements: !ruby/object:Gem::Requirement
53
59
  requirements:
54
- - - ">"
60
+ - - ">="
55
61
  - !ruby/object:Gem::Version
56
- version: 2.2.1
62
+ version: 1.2.0
57
63
  - !ruby/object:Gem::Dependency
58
64
  name: bundler
59
65
  requirement: !ruby/object:Gem::Requirement
60
66
  requirements:
61
67
  - - "~>"
62
68
  - !ruby/object:Gem::Version
63
- version: '1.10'
69
+ version: '2.0'
64
70
  type: :development
65
71
  prerelease: false
66
72
  version_requirements: !ruby/object:Gem::Requirement
67
73
  requirements:
68
74
  - - "~>"
69
75
  - !ruby/object:Gem::Version
70
- version: '1.10'
76
+ version: '2.0'
71
77
  - !ruby/object:Gem::Dependency
72
78
  name: rake
73
79
  requirement: !ruby/object:Gem::Requirement
74
80
  requirements:
75
81
  - - "~>"
76
82
  - !ruby/object:Gem::Version
77
- version: '10.0'
83
+ version: '12.0'
78
84
  type: :development
79
85
  prerelease: false
80
86
  version_requirements: !ruby/object:Gem::Requirement
81
87
  requirements:
82
88
  - - "~>"
83
89
  - !ruby/object:Gem::Version
84
- version: '10.0'
90
+ version: '12.0'
85
91
  - !ruby/object:Gem::Dependency
86
92
  name: rspec
87
93
  requirement: !ruby/object:Gem::Requirement
@@ -137,7 +143,8 @@ files:
137
143
  - plaintext.gemspec
138
144
  - plaintext.yml.example
139
145
  homepage: https://github.com/planio-gmbh/plaintext
140
- licenses: []
146
+ licenses:
147
+ - GPL-2.0
141
148
  metadata: {}
142
149
  post_install_message:
143
150
  rdoc_options: []
@@ -154,8 +161,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
161
  - !ruby/object:Gem::Version
155
162
  version: '0'
156
163
  requirements: []
157
- rubyforge_project:
158
- rubygems_version: 2.4.5.5
164
+ rubygems_version: 3.0.3
159
165
  signing_key:
160
166
  specification_version: 4
161
167
  summary: Extract plain text from most common office documents.