plaintext 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +17 -0
- data/Gemfile +3 -1
- data/README.md +3 -0
- data/lib/plaintext/file_handler.rb +23 -1
- data/lib/plaintext/file_handler/external_command_handler.rb +9 -2
- data/lib/plaintext/file_handler/external_command_handler/rtf_handler.rb +24 -1
- data/lib/plaintext/file_handler/plaintext_handler.rb +5 -3
- data/lib/plaintext/file_handler/zipped_xml_handler.rb +2 -2
- data/lib/plaintext/file_handler/zipped_xml_handler/office_document_handler/pptx_handler.rb +2 -2
- data/lib/plaintext/resolver.rb +10 -4
- data/lib/plaintext/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eb87c916a0f34e20251b5a97cbcf1e354fc046b0
|
4
|
+
data.tar.gz: e6dff85b7b0cec1ae3fdebbaabb950be1b563215
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48be94f65a07c3e400bacfae575758494afbf3d6004b19a99f7838a032b26d4503ae3ea47b46548de0dd34b4892c551d5d7af3620a72375ae9a771afc0601cc2
|
7
|
+
data.tar.gz: 0b5dc9aa9abae205083d3dd4dcc89572bf36361fee098c6145472735ac6d10c810b07595be095565baa18b99534b61bbbc63b4e26c445ccdac94a3468b6c962c
|
data/CHANGELOG
CHANGED
@@ -6,9 +6,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.3.0] - 2019-01-09
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- `:max_plaintext_bytes` option to place an upper limit on the number of bytes
|
13
|
+
returned. Also limits the amount of data that is actually read from plain text
|
14
|
+
files and external command output to limit memory usage. Set this on the
|
15
|
+
resolver object.
|
16
|
+
|
17
|
+
### Changed
|
18
|
+
- the unrtf handler now strips the preamble which unrtf adds to its output.
|
19
|
+
- text from external command handlers is now converted to UTF-8 before it's returned.
|
20
|
+
- the FileHandler `text` method signature has been changed by adding an options
|
21
|
+
hash to support passing the max_plaintext_bytes limit.
|
22
|
+
|
23
|
+
|
9
24
|
## [0.2.0] - 2018-12-22
|
25
|
+
|
10
26
|
### Changed
|
11
27
|
- relaxed Nokogiri dependency to '~> 1.8' for Redmine 4 (which uses 1.9)
|
12
28
|
|
29
|
+
|
13
30
|
## [0.1.0] - 2018-02-15
|
14
31
|
- Initial release
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -109,6 +109,9 @@ catdoc:
|
|
109
109
|
fulltext = Plaintext::Resolver.new(file, content_type).text
|
110
110
|
```
|
111
111
|
|
112
|
+
To limit the number of bytes returned (default is 4MB), set the
|
113
|
+
`max_plaintext_bytes` property on the resolver instance before calling `text`.
|
114
|
+
|
112
115
|
## License
|
113
116
|
|
114
117
|
The `plaintext` gem is free software; you can redistribute it and/or modify it under the terms of the GNU General
|
@@ -11,5 +11,27 @@ module Plaintext
|
|
11
11
|
false
|
12
12
|
end
|
13
13
|
end
|
14
|
+
|
15
|
+
# use `#set(max_size: 1.megabyte)` to give an upper limit of data to be read.
|
16
|
+
#
|
17
|
+
# By default, all data (whole file / command output) will be read which can
|
18
|
+
# be a problem with huge text files (eg SQL dumps)
|
19
|
+
def set(args = {})
|
20
|
+
options.update args
|
21
|
+
self
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
# maximum number of bytes to read from external command output or text
|
27
|
+
# files
|
28
|
+
def max_size
|
29
|
+
options[:max_size]
|
30
|
+
end
|
31
|
+
|
32
|
+
def options
|
33
|
+
@options ||= {}
|
34
|
+
end
|
35
|
+
|
14
36
|
end
|
15
|
-
end
|
37
|
+
end
|
@@ -22,12 +22,13 @@ module Plaintext
|
|
22
22
|
|
23
23
|
FILE_PLACEHOLDER = '__FILE__'.freeze
|
24
24
|
|
25
|
-
def text(file)
|
25
|
+
def text(file, options = {})
|
26
26
|
cmd = @command.dup
|
27
27
|
cmd[cmd.index(FILE_PLACEHOLDER)] = Pathname(file).to_s
|
28
|
-
shellout(cmd){ |io|
|
28
|
+
shellout(cmd){ |io| read io, options[:max_size] }.to_s
|
29
29
|
end
|
30
30
|
|
31
|
+
|
31
32
|
def accept?(content_type)
|
32
33
|
super and available?
|
33
34
|
end
|
@@ -39,5 +40,11 @@ module Plaintext
|
|
39
40
|
def self.available?
|
40
41
|
new.available?
|
41
42
|
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def read(io, max_size = nil)
|
47
|
+
Plaintext::CodesetUtil.to_utf8 io.read(max_size), "ASCII-8BIT"
|
48
|
+
end
|
42
49
|
end
|
43
50
|
end
|
@@ -3,11 +3,34 @@
|
|
3
3
|
module Plaintext
|
4
4
|
class RtfHandler < ExternalCommandHandler
|
5
5
|
DEFAULT = [
|
6
|
-
'/usr/bin/unrtf', '--text', '__FILE__'
|
6
|
+
'/usr/bin/unrtf', '--nopict', '--quiet', '--text', '__FILE__'
|
7
7
|
].freeze
|
8
8
|
def initialize
|
9
9
|
@content_type = 'application/rtf'
|
10
10
|
@command = Plaintext::Configuration['unrtf'] || DEFAULT
|
11
11
|
end
|
12
|
+
|
13
|
+
private
|
14
|
+
|
15
|
+
UNRTF_HEADER = "### Translation from RTF performed by UnRTF"
|
16
|
+
END_MARKER = "-----------------\n"
|
17
|
+
|
18
|
+
def read(io, max_size = nil)
|
19
|
+
if line = io.read(UNRTF_HEADER.length)
|
20
|
+
string = if line.starts_with? UNRTF_HEADER
|
21
|
+
io.gets while $_ != END_MARKER
|
22
|
+
io.read max_size
|
23
|
+
else
|
24
|
+
if max_size.nil?
|
25
|
+
line + io.read
|
26
|
+
elsif max_size > UNRTF_HEADER.length
|
27
|
+
line + io.read(max_size - UNRTF_HEADER.length)
|
28
|
+
else
|
29
|
+
line[0,max_size]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
Plaintext::CodesetUtil.to_utf8 string, "ASCII-8BIT"
|
33
|
+
end
|
34
|
+
end
|
12
35
|
end
|
13
36
|
end
|
@@ -1,14 +1,16 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Plaintext
|
4
|
+
|
4
5
|
class PlaintextHandler < FileHandler
|
5
6
|
CONTENT_TYPES = %w(text/csv text/plain)
|
6
7
|
def initialize
|
7
8
|
@content_types = CONTENT_TYPES
|
8
9
|
end
|
9
10
|
|
10
|
-
def text(file)
|
11
|
-
|
11
|
+
def text(file, options = {})
|
12
|
+
max_size = options[:max_size]
|
13
|
+
Plaintext::CodesetUtil.to_utf8 IO.read(file, max_size), 'UTF-8'
|
12
14
|
end
|
13
15
|
end
|
14
|
-
end
|
16
|
+
end
|
@@ -37,7 +37,7 @@ module Plaintext
|
|
37
37
|
end
|
38
38
|
end
|
39
39
|
|
40
|
-
def text(file)
|
40
|
+
def text(file, options = {})
|
41
41
|
Zip::File.open(file) do |zip_file|
|
42
42
|
zip_file.each do |entry|
|
43
43
|
if entry.name == @file_name
|
@@ -55,4 +55,4 @@ module Plaintext
|
|
55
55
|
sax_doc.text
|
56
56
|
end
|
57
57
|
end
|
58
|
-
end
|
58
|
+
end
|
@@ -14,7 +14,7 @@ module Plaintext
|
|
14
14
|
@namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
|
15
15
|
end
|
16
16
|
|
17
|
-
def text(file)
|
17
|
+
def text(file, options = {})
|
18
18
|
slides = []
|
19
19
|
Zip::File.open(file) do |zip_file|
|
20
20
|
zip_file.each do |entry|
|
@@ -27,4 +27,4 @@ module Plaintext
|
|
27
27
|
slides.map(&:last).join ' '
|
28
28
|
end
|
29
29
|
end
|
30
|
-
end
|
30
|
+
end
|
data/lib/plaintext/resolver.rb
CHANGED
@@ -2,7 +2,9 @@
|
|
2
2
|
|
3
3
|
module Plaintext
|
4
4
|
class Resolver
|
5
|
-
|
5
|
+
|
6
|
+
# maximum length of returned plain text in bytes. Default: 4MB
|
7
|
+
attr_accessor :max_plaintext_bytes
|
6
8
|
|
7
9
|
class << self
|
8
10
|
attr_accessor :cached_file_handlers
|
@@ -26,15 +28,19 @@ module Plaintext
|
|
26
28
|
def initialize(file, content_type = nil)
|
27
29
|
@file = file
|
28
30
|
@content_type = content_type
|
31
|
+
@max_plaintext_bytes = 4_194_304 # 4 megabytes
|
29
32
|
end
|
30
33
|
|
34
|
+
|
31
35
|
# Returns the extracted fulltext or nil if no matching handler was found
|
32
36
|
# for the file type.
|
33
37
|
def text
|
34
|
-
if handler = find_handler and
|
35
|
-
|
38
|
+
if handler = find_handler and
|
39
|
+
text = handler.text(@file, max_size: max_plaintext_bytes)
|
40
|
+
|
41
|
+
text.gsub!(/\s+/m, ' ')
|
36
42
|
text.strip!
|
37
|
-
text.mb_chars.compose.limit(
|
43
|
+
text.mb_chars.compose.limit(max_plaintext_bytes).to_s
|
38
44
|
end
|
39
45
|
end
|
40
46
|
|
data/lib/plaintext/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plaintext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Krämer
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2019-01-09 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rubyzip
|
@@ -155,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
155
155
|
version: '0'
|
156
156
|
requirements: []
|
157
157
|
rubyforge_project:
|
158
|
-
rubygems_version: 2.
|
158
|
+
rubygems_version: 2.5.2.1
|
159
159
|
signing_key:
|
160
160
|
specification_version: 4
|
161
161
|
summary: Extract plain text from most common office documents.
|