plaintext 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c04f6f06ab6a16b423c2a74ddb38685a49a23372
4
- data.tar.gz: 8dc49ce59bde01f25983b982323889a512fa900f
3
+ metadata.gz: eb87c916a0f34e20251b5a97cbcf1e354fc046b0
4
+ data.tar.gz: e6dff85b7b0cec1ae3fdebbaabb950be1b563215
5
5
  SHA512:
6
- metadata.gz: 13876311c960330abc0596ccc09a5eb172feca1101fa154386182bb04c7cfe6ca5f2b8fb4a408ffa658fbe0cbc616e29c8770c2a0680ec11064e950a75eae19d
7
- data.tar.gz: 50f4d40bcb4bfb257b4e280e081cea291a633203746c322c028e6dcacd53b68f0e5c50da6ede80bdeb0ae5a7c300c0ddb38694b4ee97d6ed8343ae81e5c8a69e
6
+ metadata.gz: 48be94f65a07c3e400bacfae575758494afbf3d6004b19a99f7838a032b26d4503ae3ea47b46548de0dd34b4892c551d5d7af3620a72375ae9a771afc0601cc2
7
+ data.tar.gz: 0b5dc9aa9abae205083d3dd4dcc89572bf36361fee098c6145472735ac6d10c810b07595be095565baa18b99534b61bbbc63b4e26c445ccdac94a3468b6c962c
data/CHANGELOG CHANGED
@@ -6,9 +6,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.0] - 2019-01-09
10
+
11
+ ### Added
12
+ - `:max_plaintext_bytes` option to place an upper limit on the number of bytes
13
+ returned. Also limits the amount of data that is actually read from plain text
14
+ files and external command output to limit memory usage. Set this on the
15
+ resolver object.
16
+
17
+ ### Changed
18
+ - the unrtf handler now strips the preamble which unrtf adds to its output.
19
+ - text from external command handlers is now converted to UTF-8 before it's returned.
20
+ - the FileHandler `text` method signature has changed: it now takes an optional
21
+ options hash as a second argument, through which the resolver passes the max_plaintext_bytes limit (as `:max_size`).
22
+
23
+
9
24
  ## [0.2.0] - 2018-12-22
25
+
10
26
  ### Changed
11
27
  - relaxed Nokogiri dependency to '~> 1.8' for Redmine 4 (which uses 1.9)
12
28
 
29
+
13
30
  ## [0.1.0] - 2018-02-15
14
31
  - Initial release
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
+ gem "byebug"
4
+
3
5
  # Specify your gem's dependencies in plaintext.gemspec
4
- gemspec
6
+ gemspec
data/README.md CHANGED
@@ -109,6 +109,9 @@ catdoc:
109
109
  fulltext = Plaintext::Resolver.new(file, content_type).text
110
110
  ```
111
111
 
112
+ To limit the number of bytes returned (the default is 4 MB), set the
113
+ `max_plaintext_bytes` attribute on the resolver instance before calling `text`, e.g. `resolver.max_plaintext_bytes = 1_048_576`.
114
+
112
115
  ## License
113
116
 
114
117
  The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
@@ -11,5 +11,27 @@ module Plaintext
11
11
  false
12
12
  end
13
13
  end
14
+
15
+ # use `#set(max_size: 1.megabyte)` to give an upper limit of data to be read.
16
+ #
17
+ # By default, all data (whole file / command output) will be read, which can
18
+ # be a problem with huge text files (e.g. SQL dumps).
19
+ def set(args = {})
20
+ options.update args
21
+ self
22
+ end
23
+
24
+ private
25
+
26
+ # maximum number of bytes to read from external command output or text
27
+ # files
28
+ def max_size
29
+ options[:max_size]
30
+ end
31
+
32
+ def options
33
+ @options ||= {}
34
+ end
35
+
14
36
  end
15
- end
37
+ end
@@ -22,12 +22,13 @@ module Plaintext
22
22
 
23
23
  FILE_PLACEHOLDER = '__FILE__'.freeze
24
24
 
25
- def text(file)
25
+ def text(file, options = {})
26
26
  cmd = @command.dup
27
27
  cmd[cmd.index(FILE_PLACEHOLDER)] = Pathname(file).to_s
28
- shellout(cmd){ |io| io.read }.to_s
28
+ shellout(cmd){ |io| read io, options[:max_size] }.to_s
29
29
  end
30
30
 
31
+
31
32
  def accept?(content_type)
32
33
  super and available?
33
34
  end
@@ -39,5 +40,11 @@ module Plaintext
39
40
  def self.available?
40
41
  new.available?
41
42
  end
43
+
44
+ private
45
+
46
+ def read(io, max_size = nil)
47
+ Plaintext::CodesetUtil.to_utf8 io.read(max_size), "ASCII-8BIT"
48
+ end
42
49
  end
43
50
  end
@@ -3,11 +3,34 @@
3
3
  module Plaintext
4
4
  class RtfHandler < ExternalCommandHandler
5
5
  DEFAULT = [
6
- '/usr/bin/unrtf', '--text', '__FILE__'
6
+ '/usr/bin/unrtf', '--nopict', '--quiet', '--text', '__FILE__'
7
7
  ].freeze
8
8
  def initialize
9
9
  @content_type = 'application/rtf'
10
10
  @command = Plaintext::Configuration['unrtf'] || DEFAULT
11
11
  end
12
+
13
+ private
14
+
15
+ UNRTF_HEADER = "### Translation from RTF performed by UnRTF"
16
+ END_MARKER = "-----------------\n"
17
+
18
+ def read(io, max_size = nil)
19
+ if line = io.read(UNRTF_HEADER.length)
20
+ string = if line.starts_with? UNRTF_HEADER
21
+ io.gets while $_ != END_MARKER
22
+ io.read max_size
23
+ else
24
+ if max_size.nil?
25
+ line + io.read
26
+ elsif max_size > UNRTF_HEADER.length
27
+ line + io.read(max_size - UNRTF_HEADER.length)
28
+ else
29
+ line[0,max_size]
30
+ end
31
+ end
32
+ Plaintext::CodesetUtil.to_utf8 string, "ASCII-8BIT"
33
+ end
34
+ end
12
35
  end
13
36
  end
@@ -1,14 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
+
4
5
  class PlaintextHandler < FileHandler
5
6
  CONTENT_TYPES = %w(text/csv text/plain)
6
7
  def initialize
7
8
  @content_types = CONTENT_TYPES
8
9
  end
9
10
 
10
- def text(file)
11
- Plaintext::CodesetUtil.to_utf8 IO.read(file), 'UTF-8'
11
+ def text(file, options = {})
12
+ max_size = options[:max_size]
13
+ Plaintext::CodesetUtil.to_utf8 IO.read(file, max_size), 'UTF-8'
12
14
  end
13
15
  end
14
- end
16
+ end
@@ -37,7 +37,7 @@ module Plaintext
37
37
  end
38
38
  end
39
39
 
40
- def text(file)
40
+ def text(file, options = {})
41
41
  Zip::File.open(file) do |zip_file|
42
42
  zip_file.each do |entry|
43
43
  if entry.name == @file_name
@@ -55,4 +55,4 @@ module Plaintext
55
55
  sax_doc.text
56
56
  end
57
57
  end
58
- end
58
+ end
@@ -14,7 +14,7 @@ module Plaintext
14
14
  @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
15
15
  end
16
16
 
17
- def text(file)
17
+ def text(file, options = {})
18
18
  slides = []
19
19
  Zip::File.open(file) do |zip_file|
20
20
  zip_file.each do |entry|
@@ -27,4 +27,4 @@ module Plaintext
27
27
  slides.map(&:last).join ' '
28
28
  end
29
29
  end
30
- end
30
+ end
@@ -2,7 +2,9 @@
2
2
 
3
3
  module Plaintext
4
4
  class Resolver
5
- MAX_FULLTEXT_LENGTH = 4_194_304 # 4 megabytes
5
+
6
+ # maximum length of returned plain text in bytes. Default: 4MB
7
+ attr_accessor :max_plaintext_bytes
6
8
 
7
9
  class << self
8
10
  attr_accessor :cached_file_handlers
@@ -26,15 +28,19 @@ module Plaintext
26
28
  def initialize(file, content_type = nil)
27
29
  @file = file
28
30
  @content_type = content_type
31
+ @max_plaintext_bytes = 4_194_304 # 4 megabytes
29
32
  end
30
33
 
34
+
31
35
  # Returns the extracted fulltext or nil if no matching handler was found
32
36
  # for the file type.
33
37
  def text
34
- if handler = find_handler and text = handler.text(@file)
35
- text.gsub! /\s+/m, ' '
38
+ if handler = find_handler and
39
+ text = handler.text(@file, max_size: max_plaintext_bytes)
40
+
41
+ text.gsub!(/\s+/m, ' ')
36
42
  text.strip!
37
- text.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
43
+ text.mb_chars.compose.limit(max_plaintext_bytes).to_s
38
44
  end
39
45
  end
40
46
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plaintext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Krämer
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2018-12-22 00:00:00.000000000 Z
13
+ date: 2019-01-09 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rubyzip
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
155
155
  version: '0'
156
156
  requirements: []
157
157
  rubyforge_project:
158
- rubygems_version: 2.4.5.5
158
+ rubygems_version: 2.5.2.1
159
159
  signing_key:
160
160
  specification_version: 4
161
161
  summary: Extract plain text from most common office documents.