plaintext 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +17 -0
- data/Gemfile +3 -1
- data/README.md +3 -0
- data/lib/plaintext/file_handler.rb +23 -1
- data/lib/plaintext/file_handler/external_command_handler.rb +9 -2
- data/lib/plaintext/file_handler/external_command_handler/rtf_handler.rb +24 -1
- data/lib/plaintext/file_handler/plaintext_handler.rb +5 -3
- data/lib/plaintext/file_handler/zipped_xml_handler.rb +2 -2
- data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler.rb +2 -2
- data/lib/plaintext/resolver.rb +10 -4
- data/lib/plaintext/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eb87c916a0f34e20251b5a97cbcf1e354fc046b0
|
4
|
+
data.tar.gz: e6dff85b7b0cec1ae3fdebbaabb950be1b563215
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48be94f65a07c3e400bacfae575758494afbf3d6004b19a99f7838a032b26d4503ae3ea47b46548de0dd34b4892c551d5d7af3620a72375ae9a771afc0601cc2
|
7
|
+
data.tar.gz: 0b5dc9aa9abae205083d3dd4dcc89572bf36361fee098c6145472735ac6d10c810b07595be095565baa18b99534b61bbbc63b4e26c445ccdac94a3468b6c962c
|
data/CHANGELOG
CHANGED
@@ -6,9 +6,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.3.0] - 2019-01-09
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- `:max_plaintext_bytes` option to place an upper limit on the number of bytes
|
13
|
+
returned. Also limits the amount of data that is actually read from plain text
|
14
|
+
files and external command output to limit memory usage. Set this on the
|
15
|
+
resolver object.
|
16
|
+
|
17
|
+
### Changed
|
18
|
+
- the unrtf handler now strips the preamble which unrtf adds to its output.
|
19
|
+
- text from external command handlers is now converted to UTF-8 before it's returned.
|
20
|
+
- the FileHandler `text` method signature has been changed by adding an options
|
21
|
+
hash to support passing the max_plaintext_bytes limit.
|
22
|
+
|
23
|
+
|
9
24
|
## [0.2.0] - 2018-12-22
|
25
|
+
|
10
26
|
### Changed
|
11
27
|
- relaxed Nokogiri dependency to '~> 1.8' for Redmine 4 (which uses 1.9)
|
12
28
|
|
29
|
+
|
13
30
|
## [0.1.0] - 2018-02-15
|
14
31
|
- Initial release
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -109,6 +109,9 @@ catdoc:
|
|
109
109
|
fulltext = Plaintext::Resolver.new(file, content_type).text
|
110
110
|
```
|
111
111
|
|
112
|
+
To limit the number of bytes returned (default is 4MB), set the
|
113
|
+
`max_plaintext_bytes` property on the resolver instance before calling `text`.
|
114
|
+
|
112
115
|
## License
|
113
116
|
|
114
117
|
The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
|
@@ -11,5 +11,27 @@ module Plaintext
|
|
11
11
|
false
|
12
12
|
end
|
13
13
|
end
|
14
|
+
|
15
|
+
# use `#set(max_size: 1.megabyte)` to give an upper limit of data to be read.
|
16
|
+
#
|
17
|
+
# By default, all data (whole file / command output) will be read which can
|
18
|
+
# be a problem with huge text files (eg SQL dumps)
|
19
|
+
def set(args = {})
|
20
|
+
options.update args
|
21
|
+
self
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
# maximum number of bytes to read from external command output or text
|
27
|
+
# files
|
28
|
+
def max_size
|
29
|
+
options[:max_size]
|
30
|
+
end
|
31
|
+
|
32
|
+
def options
|
33
|
+
@options ||= {}
|
34
|
+
end
|
35
|
+
|
14
36
|
end
|
15
|
-
end
|
37
|
+
end
|
@@ -22,12 +22,13 @@ module Plaintext
|
|
22
22
|
|
23
23
|
FILE_PLACEHOLDER = '__FILE__'.freeze
|
24
24
|
|
25
|
-
def text(file)
|
25
|
+
def text(file, options = {})
|
26
26
|
cmd = @command.dup
|
27
27
|
cmd[cmd.index(FILE_PLACEHOLDER)] = Pathname(file).to_s
|
28
|
-
shellout(cmd){ |io|
|
28
|
+
shellout(cmd){ |io| read io, options[:max_size] }.to_s
|
29
29
|
end
|
30
30
|
|
31
|
+
|
31
32
|
def accept?(content_type)
|
32
33
|
super and available?
|
33
34
|
end
|
@@ -39,5 +40,11 @@ module Plaintext
|
|
39
40
|
def self.available?
|
40
41
|
new.available?
|
41
42
|
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def read(io, max_size = nil)
|
47
|
+
Plaintext::CodesetUtil.to_utf8 io.read(max_size), "ASCII-8BIT"
|
48
|
+
end
|
42
49
|
end
|
43
50
|
end
|
@@ -3,11 +3,34 @@
|
|
3
3
|
module Plaintext
|
4
4
|
class RtfHandler < ExternalCommandHandler
|
5
5
|
DEFAULT = [
|
6
|
-
'/usr/bin/unrtf', '--text', '__FILE__'
|
6
|
+
'/usr/bin/unrtf', '--nopict', '--quiet', '--text', '__FILE__'
|
7
7
|
].freeze
|
8
8
|
def initialize
|
9
9
|
@content_type = 'application/rtf'
|
10
10
|
@command = Plaintext::Configuration['unrtf'] || DEFAULT
|
11
11
|
end
|
12
|
+
|
13
|
+
private
|
14
|
+
|
15
|
+
UNRTF_HEADER = "### Translation from RTF performed by UnRTF"
|
16
|
+
END_MARKER = "-----------------\n"
|
17
|
+
|
18
|
+
def read(io, max_size = nil)
|
19
|
+
if line = io.read(UNRTF_HEADER.length)
|
20
|
+
string = if line.starts_with? UNRTF_HEADER
|
21
|
+
io.gets while $_ != END_MARKER
|
22
|
+
io.read max_size
|
23
|
+
else
|
24
|
+
if max_size.nil?
|
25
|
+
line + io.read
|
26
|
+
elsif max_size > UNRTF_HEADER.length
|
27
|
+
line + io.read(max_size - UNRTF_HEADER.length)
|
28
|
+
else
|
29
|
+
line[0,max_size]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
Plaintext::CodesetUtil.to_utf8 string, "ASCII-8BIT"
|
33
|
+
end
|
34
|
+
end
|
12
35
|
end
|
13
36
|
end
|
@@ -1,14 +1,16 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Plaintext
|
4
|
+
|
4
5
|
class PlaintextHandler < FileHandler
|
5
6
|
CONTENT_TYPES = %w(text/csv text/plain)
|
6
7
|
def initialize
|
7
8
|
@content_types = CONTENT_TYPES
|
8
9
|
end
|
9
10
|
|
10
|
-
def text(file)
|
11
|
-
|
11
|
+
def text(file, options = {})
|
12
|
+
max_size = options[:max_size]
|
13
|
+
Plaintext::CodesetUtil.to_utf8 IO.read(file, max_size), 'UTF-8'
|
12
14
|
end
|
13
15
|
end
|
14
|
-
end
|
16
|
+
end
|
@@ -37,7 +37,7 @@ module Plaintext
|
|
37
37
|
end
|
38
38
|
end
|
39
39
|
|
40
|
-
def text(file)
|
40
|
+
def text(file, options = {})
|
41
41
|
Zip::File.open(file) do |zip_file|
|
42
42
|
zip_file.each do |entry|
|
43
43
|
if entry.name == @file_name
|
@@ -55,4 +55,4 @@ module Plaintext
|
|
55
55
|
sax_doc.text
|
56
56
|
end
|
57
57
|
end
|
58
|
-
end
|
58
|
+
end
|
@@ -14,7 +14,7 @@ module Plaintext
|
|
14
14
|
@namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
|
15
15
|
end
|
16
16
|
|
17
|
-
def text(file)
|
17
|
+
def text(file, options = {})
|
18
18
|
slides = []
|
19
19
|
Zip::File.open(file) do |zip_file|
|
20
20
|
zip_file.each do |entry|
|
@@ -27,4 +27,4 @@ module Plaintext
|
|
27
27
|
slides.map(&:last).join ' '
|
28
28
|
end
|
29
29
|
end
|
30
|
-
end
|
30
|
+
end
|
data/lib/plaintext/resolver.rb
CHANGED
@@ -2,7 +2,9 @@
|
|
2
2
|
|
3
3
|
module Plaintext
|
4
4
|
class Resolver
|
5
|
-
|
5
|
+
|
6
|
+
# maximum length of returned plain text in bytes. Default: 4MB
|
7
|
+
attr_accessor :max_plaintext_bytes
|
6
8
|
|
7
9
|
class << self
|
8
10
|
attr_accessor :cached_file_handlers
|
@@ -26,15 +28,19 @@ module Plaintext
|
|
26
28
|
def initialize(file, content_type = nil)
|
27
29
|
@file = file
|
28
30
|
@content_type = content_type
|
31
|
+
@max_plaintext_bytes = 4_194_304 # 4 megabytes
|
29
32
|
end
|
30
33
|
|
34
|
+
|
31
35
|
# Returns the extracted fulltext or nil if no matching handler was found
|
32
36
|
# for the file type.
|
33
37
|
def text
|
34
|
-
if handler = find_handler and
|
35
|
-
|
38
|
+
if handler = find_handler and
|
39
|
+
text = handler.text(@file, max_size: max_plaintext_bytes)
|
40
|
+
|
41
|
+
text.gsub!(/\s+/m, ' ')
|
36
42
|
text.strip!
|
37
|
-
text.mb_chars.compose.limit(
|
43
|
+
text.mb_chars.compose.limit(max_plaintext_bytes).to_s
|
38
44
|
end
|
39
45
|
end
|
40
46
|
|
data/lib/plaintext/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plaintext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Krämer
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2019-01-09 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rubyzip
|
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
155
155
|
version: '0'
|
156
156
|
requirements: []
|
157
157
|
rubyforge_project:
|
158
|
-
rubygems_version: 2.
|
158
|
+
rubygems_version: 2.5.2.1
|
159
159
|
signing_key:
|
160
160
|
specification_version: 4
|
161
161
|
summary: Extract plain text from most common office documents.
|