plaintext 0.2.0 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: c04f6f06ab6a16b423c2a74ddb38685a49a23372
4
- data.tar.gz: 8dc49ce59bde01f25983b982323889a512fa900f
2
+ SHA256:
3
+ metadata.gz: fde4677879a9d3ce30844e9a5b0439ad2f708f2f532927f96b16d962bd4deb07
4
+ data.tar.gz: 900cf36c77f25e98876e9838f2e21a11244f37f059c6c272386fd8102cfd500d
5
5
  SHA512:
6
- metadata.gz: 13876311c960330abc0596ccc09a5eb172feca1101fa154386182bb04c7cfe6ca5f2b8fb4a408ffa658fbe0cbc616e29c8770c2a0680ec11064e950a75eae19d
7
- data.tar.gz: 50f4d40bcb4bfb257b4e280e081cea291a633203746c322c028e6dcacd53b68f0e5c50da6ede80bdeb0ae5a7c300c0ddb38694b4ee97d6ed8343ae81e5c8a69e
6
+ metadata.gz: c744870cc385445ae9a8ad68cfbc5e9489356f23750acdf7a0e8b54ec1ae9c674457c1f14e226aa8aee3bf542ba5fabeac524fda6ad0d89601414c0b77196e5a
7
+ data.tar.gz: e6c899c622b42856fd2671b0a7961a938e89389801904d70069b1b1735b8e6e374610f7f2b4d72fd08b16db1f24dcaa994e22b59f9e2a73585206f173b28a234
data/.travis.yml CHANGED
@@ -1,7 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.2.3
3
+ - 2.6.4
4
4
  before_install:
5
5
  - sudo apt-get -qq update
6
6
  - sudo apt-get install -y catdoc unrtf poppler-utils tesseract-ocr
7
- - gem install bundler -v 1.10.6
7
+ - gem install bundler -v 2.0.1
data/CHANGELOG CHANGED
@@ -6,9 +6,50 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.4] - 2021-04-21
10
+ - Further relax the rubyzip version requirement to allow 2.x versions
11
+ - Fix: Removed --quiet from unrtf arguments. In newer versions this seems to
12
+ suppress the header line which we used as a marker to strip unrtf comments and
13
+ metadata from the command output.
14
+
15
+ ## [0.3.3] - 2019-10-01
16
+ - Relax rubyzip dependency. Our usage of rubyzip does not require addressing
17
+ CVE-2019-16892 and thus is fine with 1.2, but applications using this Gem
18
+ might use RubyZIP elsewhere too and thus will want to upgrade to >= 1.3.
19
+
20
+ ## [0.3.2] - 2019-09-02
21
+ - Set minimum Nokogiri version to 1.10.4. See CVE-2019-5477.
22
+ - Fix encoding issues for PDFs.
23
+ - Bump development dependencies to bundler version 2 and
24
+ rake version 12
25
+ - Update travis file to use ruby 2.6.4 and bundler 2.0.1
26
+
27
+ ## [0.3.1] - 2019-01-16
28
+
29
+ ### Added
30
+ - The max_plaintext_bytes limit introduced in 0.3.0 is now also enforced in the
31
+ zipped XML handlers responsible for office document parsing.
32
+
33
+ ## [0.3.0] - 2019-01-09
34
+
35
+ ### Added
36
+ - `:max_plaintext_bytes` option to place an upper limit on the number of bytes
37
+ returned. Also limits the amount of data that is actually read from plain text
38
+ files and external command output to limit memory usage. Set this on the
39
+ resolver object.
40
+
41
+ ### Changed
42
+ - the unrtf handler now strips the preamble which unrtf adds to its output.
43
+ - text from external command handlers is now converted to UTF-8 before it's returned.
44
+ - the FileHandler `text` method signature has been changed by adding an options
45
+ hash to support passing the max_plaintext_bytes limit.
46
+
47
+
9
48
  ## [0.2.0] - 2018-12-22
49
+
10
50
  ### Changed
11
51
  - relaxed Nokogiri dependency to '~> 1.8' for Redmine 4 (which uses 1.9)
12
52
 
53
+
13
54
  ## [0.1.0] - 2018-02-15
14
55
  - Initial release
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
+ gem "byebug"
4
+
3
5
  # Specify your gem's dependencies in plaintext.gemspec
4
- gemspec
6
+ gemspec
data/README.md CHANGED
@@ -109,6 +109,9 @@ catdoc:
109
109
  fulltext = Plaintext::Resolver.new(file, content_type).text
110
110
  ```
111
111
 
112
+ To limit the number of bytes returned (default is 4MB), set the
113
+ `max_plaintext_bytes` property on the resolver instance before calling `text`.
114
+
112
115
  ## License
113
116
 
114
117
  The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
@@ -11,5 +11,27 @@ module Plaintext
11
11
  false
12
12
  end
13
13
  end
14
+
15
+ # use `#set(max_size: 1.megabyte)` to give an upper limit of data to be read.
16
+ #
17
+ # By default, all data (whole file / command output) will be read which can
18
+ # be a problem with huge text files (eg SQL dumps)
19
+ def set(args = {})
20
+ options.update args
21
+ self
22
+ end
23
+
24
+ private
25
+
26
+ # maximum number of bytes to read from external command output or text
27
+ # files
28
+ def max_size
29
+ options[:max_size]
30
+ end
31
+
32
+ def options
33
+ @options ||= {}
34
+ end
35
+
14
36
  end
15
- end
37
+ end
@@ -11,23 +11,26 @@ module Plaintext
11
11
  # Due to how popen works the command will be executed directly without
12
12
  # involving the shell if cmd is an array.
13
13
  require 'fileutils'
14
+
15
+ FILE_PLACEHOLDER = '__FILE__'.freeze
16
+ DEFAULT_STREAM_ENCODING = 'ASCII-8BIT'.freeze
17
+
14
18
  def shellout(cmd, options = {}, &block)
15
19
  mode = "r+"
16
20
  IO.popen(cmd, mode) do |io|
17
- io.set_encoding("ASCII-8BIT") if io.respond_to?(:set_encoding)
21
+ set_stream_encoding(io)
18
22
  io.close_write unless options[:write_stdin]
19
23
  block.call(io) if block_given?
20
24
  end
21
25
  end
22
26
 
23
- FILE_PLACEHOLDER = '__FILE__'.freeze
24
-
25
- def text(file)
27
+ def text(file, options = {})
26
28
  cmd = @command.dup
27
29
  cmd[cmd.index(FILE_PLACEHOLDER)] = Pathname(file).to_s
28
- shellout(cmd){ |io| io.read }.to_s
30
+ shellout(cmd) { |io| read io, options[:max_size] }.to_s
29
31
  end
30
32
 
33
+
31
34
  def accept?(content_type)
32
35
  super and available?
33
36
  end
@@ -39,5 +42,33 @@ module Plaintext
39
42
  def self.available?
40
43
  new.available?
41
44
  end
45
+
46
+ protected
47
+
48
+ def utf8_stream?
49
+ false
50
+ end
51
+
52
+ private
53
+
54
+ def set_stream_encoding(io)
55
+ return unless io.respond_to?(:set_encoding)
56
+
57
+ if utf8_stream?
58
+ io.set_encoding('UTF-8'.freeze)
59
+ else
60
+ io.set_encoding(DEFAULT_STREAM_ENCODING)
61
+ end
62
+ end
63
+
64
+ def read(io, max_size = nil)
65
+ piece = io.read(max_size)
66
+
67
+ if utf8_stream?
68
+ piece
69
+ else
70
+ Plaintext::CodesetUtil.to_utf8 piece, DEFAULT_STREAM_ENCODING
71
+ end
72
+ end
42
73
  end
43
74
  end
@@ -5,9 +5,16 @@ module Plaintext
5
5
  DEFAULT = [
6
6
  '/usr/bin/pdftotext', '-enc', 'UTF-8', '__FILE__', '-'
7
7
  ].freeze
8
+
8
9
  def initialize
9
10
  @content_type = 'application/pdf'
10
11
  @command = Plaintext::Configuration['pdftotext'] || DEFAULT
11
12
  end
13
+
14
+ protected
15
+
16
+ def utf8_stream?
17
+ true
18
+ end
12
19
  end
13
20
  end
@@ -3,11 +3,34 @@
3
3
  module Plaintext
4
4
  class RtfHandler < ExternalCommandHandler
5
5
  DEFAULT = [
6
- '/usr/bin/unrtf', '--text', '__FILE__'
6
+ '/usr/bin/unrtf', '--nopict', '--text', '__FILE__'
7
7
  ].freeze
8
8
  def initialize
9
9
  @content_type = 'application/rtf'
10
10
  @command = Plaintext::Configuration['unrtf'] || DEFAULT
11
11
  end
12
+
13
+ private
14
+
15
+ UNRTF_HEADER = "### Translation from RTF performed by UnRTF"
16
+ END_MARKER = "-----------------\n"
17
+
18
+ def read(io, max_size = nil)
19
+ if line = io.read(UNRTF_HEADER.length)
20
+ string = if line.starts_with? UNRTF_HEADER
21
+ io.gets while $_ != END_MARKER
22
+ io.read max_size
23
+ else
24
+ if max_size.nil?
25
+ line + io.read
26
+ elsif max_size > UNRTF_HEADER.length
27
+ line + io.read(max_size - UNRTF_HEADER.length)
28
+ else
29
+ line[0,max_size]
30
+ end
31
+ end
32
+ Plaintext::CodesetUtil.to_utf8 string, "ASCII-8BIT"
33
+ end
34
+ end
12
35
  end
13
36
  end
@@ -1,14 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
+
4
5
  class PlaintextHandler < FileHandler
5
6
  CONTENT_TYPES = %w(text/csv text/plain)
6
7
  def initialize
7
8
  @content_types = CONTENT_TYPES
8
9
  end
9
10
 
10
- def text(file)
11
- Plaintext::CodesetUtil.to_utf8 IO.read(file), 'UTF-8'
11
+ def text(file, options = {})
12
+ max_size = options[:max_size]
13
+ Plaintext::CodesetUtil.to_utf8 IO.read(file, max_size), 'UTF-8'
12
14
  end
13
15
  end
14
- end
16
+ end
@@ -9,16 +9,26 @@ module Plaintext
9
9
  class SaxDocument < Nokogiri::XML::SAX::Document
10
10
  attr_reader :text
11
11
 
12
- def initialize(text_element, text_namespace)
12
+ def initialize(text_element, text_namespace, max_size = nil)
13
13
  @element = text_element
14
14
  @namespace_uri = text_namespace
15
+ @max_size = max_size
16
+
15
17
  @text = ''.dup
16
18
  @is_text = false
17
19
  end
18
20
 
21
+ def text_length_exceeded?
22
+ @max_size && (@text.length > @max_size)
23
+ end
24
+
25
+
19
26
  # Handle each element, expecting the name and any attributes
20
27
  def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
21
- if name == @element and uri == @namespace_uri
28
+ if name == @element and
29
+ uri == @namespace_uri and
30
+ !text_length_exceeded?
31
+
22
32
  @is_text = true
23
33
  end
24
34
  end
@@ -30,18 +40,22 @@ module Plaintext
30
40
 
31
41
  # Given the name of an element once its closing tag is reached
32
42
  def end_element_namespace(name, prefix = nil, uri = nil)
33
- if name == @element and uri == @namespace_uri
43
+ if name == @element and
44
+ uri == @namespace_uri and
45
+ @is_text
46
+
34
47
  @text << ' '
35
48
  @is_text = false
36
49
  end
37
50
  end
38
51
  end
39
52
 
40
- def text(file)
53
+ def text(file, options = {})
54
+ max_size = options[:max_size]
41
55
  Zip::File.open(file) do |zip_file|
42
56
  zip_file.each do |entry|
43
57
  if entry.name == @file_name
44
- return xml_to_text entry.get_input_stream
58
+ return xml_to_text entry.get_input_stream, max_size
45
59
  end
46
60
  end
47
61
  end
@@ -49,10 +63,11 @@ module Plaintext
49
63
 
50
64
  private
51
65
 
52
- def xml_to_text(io)
53
- sax_doc = SaxDocument.new @element, @namespace_uri
66
+ def xml_to_text(io, max_size)
67
+ sax_doc = SaxDocument.new @element, @namespace_uri, max_size
54
68
  Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
55
- sax_doc.text
69
+ text = sax_doc.text
70
+ max_size.present? ? text[0, max_size] : text
56
71
  end
57
72
  end
58
- end
73
+ end
@@ -14,17 +14,24 @@ module Plaintext
14
14
  @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
15
15
  end
16
16
 
17
- def text(file)
17
+ def text(file, options = {})
18
+ max_size = options[:max_size]
18
19
  slides = []
20
+ result = ''.dup
19
21
  Zip::File.open(file) do |zip_file|
20
22
  zip_file.each do |entry|
21
23
  if entry.name =~ /slide(\d+)\.xml/
22
- slides << [$1, xml_to_text(entry.get_input_stream)]
24
+ slides << [$1, entry]
23
25
  end
24
26
  end
27
+
28
+ slides.sort!{|a, b| a.first <=> b.first}
29
+ slides.each do |id, entry|
30
+ result << xml_to_text(entry.get_input_stream, max_size)
31
+ break if max_size and result.length >= max_size
32
+ end
25
33
  end
26
- slides.sort!{|a, b| a.first <=> b.first}
27
- slides.map(&:last).join ' '
34
+ return result
28
35
  end
29
36
  end
30
- end
37
+ end
@@ -2,7 +2,9 @@
2
2
 
3
3
  module Plaintext
4
4
  class Resolver
5
- MAX_FULLTEXT_LENGTH = 4_194_304 # 4 megabytes
5
+
6
+ # maximum length of returned plain text in bytes. Default: 4MB
7
+ attr_accessor :max_plaintext_bytes
6
8
 
7
9
  class << self
8
10
  attr_accessor :cached_file_handlers
@@ -26,15 +28,19 @@ module Plaintext
26
28
  def initialize(file, content_type = nil)
27
29
  @file = file
28
30
  @content_type = content_type
31
+ @max_plaintext_bytes = 4_194_304 # 4 megabytes
29
32
  end
30
33
 
34
+
31
35
  # Returns the extracted fulltext or nil if no matching handler was found
32
36
  # for the file type.
33
37
  def text
34
- if handler = find_handler and text = handler.text(@file)
35
- text.gsub! /\s+/m, ' '
38
+ if handler = find_handler and
39
+ text = handler.text(@file, max_size: max_plaintext_bytes)
40
+
41
+ text.gsub!(/\s+/m, ' ')
36
42
  text.strip!
37
- text.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
43
+ text.mb_chars.compose.limit(max_plaintext_bytes).to_s
38
44
  end
39
45
  end
40
46
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.4"
5
5
  end
data/plaintext.gemspec CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'plaintext/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "plaintext"
7
+ spec.name = 'plaintext'
8
8
  spec.version = Plaintext::VERSION
9
9
  spec.authors = ['Jens Krämer', 'Planio GmbH', 'OpenProject GmbH']
10
10
  spec.email = ['info@openproject.com']
@@ -12,17 +12,18 @@ Gem::Specification.new do |spec|
12
12
  spec.summary = 'Extract plain text from most common office documents.'
13
13
  spec.description = "Extract text from common office files. Based on the file's content type a command line tool is selected to do the job."
14
14
  spec.homepage = 'https://github.com/planio-gmbh/plaintext'
15
-
15
+ spec.license = 'GPL-2.0'
16
+
16
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
18
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
20
21
 
21
- spec.add_dependency 'rubyzip', '~> 1.2.1'
22
- spec.add_dependency 'nokogiri', '~> 1.8'
23
22
  spec.add_dependency 'activesupport', '>2.2.1 '
23
+ spec.add_dependency 'nokogiri', '~> 1.10', '>= 1.10.4'
24
+ spec.add_dependency 'rubyzip', '>= 1.2.0'
24
25
 
25
- spec.add_development_dependency "bundler", "~> 1.10"
26
- spec.add_development_dependency "rake", "~> 10.0"
27
- spec.add_development_dependency "rspec"
26
+ spec.add_development_dependency 'bundler', '~> 2.0'
27
+ spec.add_development_dependency 'rake', '~> 12.0'
28
+ spec.add_development_dependency 'rspec'
28
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plaintext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Krämer
@@ -10,78 +10,84 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2018-12-22 00:00:00.000000000 Z
13
+ date: 2021-04-21 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: rubyzip
16
+ name: activesupport
17
17
  requirement: !ruby/object:Gem::Requirement
18
18
  requirements:
19
- - - "~>"
19
+ - - ">"
20
20
  - !ruby/object:Gem::Version
21
- version: 1.2.1
21
+ version: 2.2.1
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  requirements:
26
- - - "~>"
26
+ - - ">"
27
27
  - !ruby/object:Gem::Version
28
- version: 1.2.1
28
+ version: 2.2.1
29
29
  - !ruby/object:Gem::Dependency
30
30
  name: nokogiri
31
31
  requirement: !ruby/object:Gem::Requirement
32
32
  requirements:
33
33
  - - "~>"
34
34
  - !ruby/object:Gem::Version
35
- version: '1.8'
35
+ version: '1.10'
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: 1.10.4
36
39
  type: :runtime
37
40
  prerelease: false
38
41
  version_requirements: !ruby/object:Gem::Requirement
39
42
  requirements:
40
43
  - - "~>"
41
44
  - !ruby/object:Gem::Version
42
- version: '1.8'
45
+ version: '1.10'
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: 1.10.4
43
49
  - !ruby/object:Gem::Dependency
44
- name: activesupport
50
+ name: rubyzip
45
51
  requirement: !ruby/object:Gem::Requirement
46
52
  requirements:
47
- - - ">"
53
+ - - ">="
48
54
  - !ruby/object:Gem::Version
49
- version: 2.2.1
55
+ version: 1.2.0
50
56
  type: :runtime
51
57
  prerelease: false
52
58
  version_requirements: !ruby/object:Gem::Requirement
53
59
  requirements:
54
- - - ">"
60
+ - - ">="
55
61
  - !ruby/object:Gem::Version
56
- version: 2.2.1
62
+ version: 1.2.0
57
63
  - !ruby/object:Gem::Dependency
58
64
  name: bundler
59
65
  requirement: !ruby/object:Gem::Requirement
60
66
  requirements:
61
67
  - - "~>"
62
68
  - !ruby/object:Gem::Version
63
- version: '1.10'
69
+ version: '2.0'
64
70
  type: :development
65
71
  prerelease: false
66
72
  version_requirements: !ruby/object:Gem::Requirement
67
73
  requirements:
68
74
  - - "~>"
69
75
  - !ruby/object:Gem::Version
70
- version: '1.10'
76
+ version: '2.0'
71
77
  - !ruby/object:Gem::Dependency
72
78
  name: rake
73
79
  requirement: !ruby/object:Gem::Requirement
74
80
  requirements:
75
81
  - - "~>"
76
82
  - !ruby/object:Gem::Version
77
- version: '10.0'
83
+ version: '12.0'
78
84
  type: :development
79
85
  prerelease: false
80
86
  version_requirements: !ruby/object:Gem::Requirement
81
87
  requirements:
82
88
  - - "~>"
83
89
  - !ruby/object:Gem::Version
84
- version: '10.0'
90
+ version: '12.0'
85
91
  - !ruby/object:Gem::Dependency
86
92
  name: rspec
87
93
  requirement: !ruby/object:Gem::Requirement
@@ -137,7 +143,8 @@ files:
137
143
  - plaintext.gemspec
138
144
  - plaintext.yml.example
139
145
  homepage: https://github.com/planio-gmbh/plaintext
140
- licenses: []
146
+ licenses:
147
+ - GPL-2.0
141
148
  metadata: {}
142
149
  post_install_message:
143
150
  rdoc_options: []
@@ -154,8 +161,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
161
  - !ruby/object:Gem::Version
155
162
  version: '0'
156
163
  requirements: []
157
- rubyforge_project:
158
- rubygems_version: 2.4.5.5
164
+ rubygems_version: 3.0.3
159
165
  signing_key:
160
166
  specification_version: 4
161
167
  summary: Extract plain text from most common office documents.