plaintext 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c04f6f06ab6a16b423c2a74ddb38685a49a23372
4
- data.tar.gz: 8dc49ce59bde01f25983b982323889a512fa900f
3
+ metadata.gz: eb87c916a0f34e20251b5a97cbcf1e354fc046b0
4
+ data.tar.gz: e6dff85b7b0cec1ae3fdebbaabb950be1b563215
5
5
  SHA512:
6
- metadata.gz: 13876311c960330abc0596ccc09a5eb172feca1101fa154386182bb04c7cfe6ca5f2b8fb4a408ffa658fbe0cbc616e29c8770c2a0680ec11064e950a75eae19d
7
- data.tar.gz: 50f4d40bcb4bfb257b4e280e081cea291a633203746c322c028e6dcacd53b68f0e5c50da6ede80bdeb0ae5a7c300c0ddb38694b4ee97d6ed8343ae81e5c8a69e
6
+ metadata.gz: 48be94f65a07c3e400bacfae575758494afbf3d6004b19a99f7838a032b26d4503ae3ea47b46548de0dd34b4892c551d5d7af3620a72375ae9a771afc0601cc2
7
+ data.tar.gz: 0b5dc9aa9abae205083d3dd4dcc89572bf36361fee098c6145472735ac6d10c810b07595be095565baa18b99534b61bbbc63b4e26c445ccdac94a3468b6c962c
data/CHANGELOG CHANGED
@@ -6,9 +6,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.0] - 2019-01-09
10
+
11
+ ### Added
12
+ - `:max_plaintext_bytes` option to place an upper limit on the number of bytes
13
+ returned. Also limits the amount of data that is actually read from plain text
14
+ files and external command output to limit memory usage. Set this on the
15
+ resolver object.
16
+
17
+ ### Changed
18
+ - the unrtf handler now strips the preamble which unrtf adds to its output.
19
+ - text from external command handlers is now converted to UTF-8 before it's returned.
20
+ - the FileHandler `text` method signature has been changed by adding an options
21
+ hash to support passing the max_plaintext_bytes limit.
22
+
23
+
9
24
  ## [0.2.0] - 2018-12-22
25
+
10
26
  ### Changed
11
27
  - relaxed Nokogiri dependency to '~> 1.8' for Redmine 4 (which uses 1.9)
12
28
 
29
+
13
30
  ## [0.1.0] - 2018-02-15
14
31
  - Initial release
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
+ gem "byebug"
4
+
3
5
  # Specify your gem's dependencies in plaintext.gemspec
4
- gemspec
6
+ gemspec
data/README.md CHANGED
@@ -109,6 +109,9 @@ catdoc:
109
109
  fulltext = Plaintext::Resolver.new(file, content_type).text
110
110
  ```
111
111
 
112
+ To limit the number of bytes returned (default is 4MB), set the
113
+ `max_plaintext_bytes` property on the resolver instance before calling `text`.
114
+
112
115
  ## License
113
116
 
114
117
  The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
@@ -11,5 +11,27 @@ module Plaintext
11
11
  false
12
12
  end
13
13
  end
14
+
15
+ # Use `#set(max_size: 1.megabyte)` to set an upper limit on the amount of data to be read.
16
+ #
17
+ # By default, all data (whole file / command output) will be read, which can
18
+ # be a problem with huge text files (e.g. SQL dumps).
19
+ def set(args = {})
20
+ options.update args
21
+ self
22
+ end
23
+
24
+ private
25
+
26
+ # maximum number of bytes to read from external command output or text
27
+ # files
28
+ def max_size
29
+ options[:max_size]
30
+ end
31
+
32
+ def options
33
+ @options ||= {}
34
+ end
35
+
14
36
  end
15
- end
37
+ end
@@ -22,12 +22,13 @@ module Plaintext
22
22
 
23
23
  FILE_PLACEHOLDER = '__FILE__'.freeze
24
24
 
25
- def text(file)
25
+ def text(file, options = {})
26
26
  cmd = @command.dup
27
27
  cmd[cmd.index(FILE_PLACEHOLDER)] = Pathname(file).to_s
28
- shellout(cmd){ |io| io.read }.to_s
28
+ shellout(cmd){ |io| read io, options[:max_size] }.to_s
29
29
  end
30
30
 
31
+
31
32
  def accept?(content_type)
32
33
  super and available?
33
34
  end
@@ -39,5 +40,11 @@ module Plaintext
39
40
  def self.available?
40
41
  new.available?
41
42
  end
43
+
44
+ private
45
+
46
+ def read(io, max_size = nil)
47
+ Plaintext::CodesetUtil.to_utf8 io.read(max_size), "ASCII-8BIT"
48
+ end
42
49
  end
43
50
  end
@@ -3,11 +3,34 @@
3
3
  module Plaintext
4
4
  class RtfHandler < ExternalCommandHandler
5
5
  DEFAULT = [
6
- '/usr/bin/unrtf', '--text', '__FILE__'
6
+ '/usr/bin/unrtf', '--nopict', '--quiet', '--text', '__FILE__'
7
7
  ].freeze
8
8
  def initialize
9
9
  @content_type = 'application/rtf'
10
10
  @command = Plaintext::Configuration['unrtf'] || DEFAULT
11
11
  end
12
+
13
+ private
14
+
15
+ UNRTF_HEADER = "### Translation from RTF performed by UnRTF"
16
+ END_MARKER = "-----------------\n"
17
+
18
+ def read(io, max_size = nil)
19
+ if line = io.read(UNRTF_HEADER.length)
20
+ string = if line.starts_with? UNRTF_HEADER
21
+ io.gets while $_ != END_MARKER
22
+ io.read max_size
23
+ else
24
+ if max_size.nil?
25
+ line + io.read
26
+ elsif max_size > UNRTF_HEADER.length
27
+ line + io.read(max_size - UNRTF_HEADER.length)
28
+ else
29
+ line[0,max_size]
30
+ end
31
+ end
32
+ Plaintext::CodesetUtil.to_utf8 string, "ASCII-8BIT"
33
+ end
34
+ end
12
35
  end
13
36
  end
@@ -1,14 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
+
4
5
  class PlaintextHandler < FileHandler
5
6
  CONTENT_TYPES = %w(text/csv text/plain)
6
7
  def initialize
7
8
  @content_types = CONTENT_TYPES
8
9
  end
9
10
 
10
- def text(file)
11
- Plaintext::CodesetUtil.to_utf8 IO.read(file), 'UTF-8'
11
+ def text(file, options = {})
12
+ max_size = options[:max_size]
13
+ Plaintext::CodesetUtil.to_utf8 IO.read(file, max_size), 'UTF-8'
12
14
  end
13
15
  end
14
- end
16
+ end
@@ -37,7 +37,7 @@ module Plaintext
37
37
  end
38
38
  end
39
39
 
40
- def text(file)
40
+ def text(file, options = {})
41
41
  Zip::File.open(file) do |zip_file|
42
42
  zip_file.each do |entry|
43
43
  if entry.name == @file_name
@@ -55,4 +55,4 @@ module Plaintext
55
55
  sax_doc.text
56
56
  end
57
57
  end
58
- end
58
+ end
@@ -14,7 +14,7 @@ module Plaintext
14
14
  @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
15
15
  end
16
16
 
17
- def text(file)
17
+ def text(file, options = {})
18
18
  slides = []
19
19
  Zip::File.open(file) do |zip_file|
20
20
  zip_file.each do |entry|
@@ -27,4 +27,4 @@ module Plaintext
27
27
  slides.map(&:last).join ' '
28
28
  end
29
29
  end
30
- end
30
+ end
@@ -2,7 +2,9 @@
2
2
 
3
3
  module Plaintext
4
4
  class Resolver
5
- MAX_FULLTEXT_LENGTH = 4_194_304 # 4 megabytes
5
+
6
+ # maximum length of returned plain text in bytes. Default: 4MB
7
+ attr_accessor :max_plaintext_bytes
6
8
 
7
9
  class << self
8
10
  attr_accessor :cached_file_handlers
@@ -26,15 +28,19 @@ module Plaintext
26
28
  def initialize(file, content_type = nil)
27
29
  @file = file
28
30
  @content_type = content_type
31
+ @max_plaintext_bytes = 4_194_304 # 4 megabytes
29
32
  end
30
33
 
34
+
31
35
  # Returns the extracted fulltext or nil if no matching handler was found
32
36
  # for the file type.
33
37
  def text
34
- if handler = find_handler and text = handler.text(@file)
35
- text.gsub! /\s+/m, ' '
38
+ if handler = find_handler and
39
+ text = handler.text(@file, max_size: max_plaintext_bytes)
40
+
41
+ text.gsub!(/\s+/m, ' ')
36
42
  text.strip!
37
- text.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
43
+ text.mb_chars.compose.limit(max_plaintext_bytes).to_s
38
44
  end
39
45
  end
40
46
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Plaintext
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: plaintext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Krämer
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2018-12-22 00:00:00.000000000 Z
13
+ date: 2019-01-09 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rubyzip
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
155
155
  version: '0'
156
156
  requirements: []
157
157
  rubyforge_project:
158
- rubygems_version: 2.4.5.5
158
+ rubygems_version: 2.5.2.1
159
159
  signing_key:
160
160
  specification_version: 4
161
161
  summary: Extract plain text from most common office documents.