chupa-text 1.0.6 → 1.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +16 -0
- data/lib/chupa-text.rb +2 -0
- data/lib/chupa-text/command/chupa-text.rb +33 -4
- data/lib/chupa-text/configuration.rb +2 -3
- data/lib/chupa-text/data.rb +25 -1
- data/lib/chupa-text/decomposers/csv.rb +44 -5
- data/lib/chupa-text/decomposers/gzip.rb +1 -2
- data/lib/chupa-text/decomposers/tar.rb +2 -2
- data/lib/chupa-text/decomposers/xml.rb +1 -2
- data/lib/chupa-text/formatters/hash.rb +10 -0
- data/lib/chupa-text/screenshot.rb +46 -0
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +48 -0
- data/test/fixture/command/chupa-text/numbers.csv +3 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a50556c287fd5148a3032b4fd8518679237e7be
|
4
|
+
data.tar.gz: 0d58a4948b3df081449e67211e23168b539be83b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 298a7416122a757fbb6018c73327f768adaa6b18402f7725e5c9dde12b320d00a7015c04a5eb90580b4c6f822678acded64e132a8ba9d8d1b1de792804d13103
|
7
|
+
data.tar.gz: 3aaaa8b120e085434094163d3e582cb4bb33c5478a2916a268cd5b03cfff10b48cd05f6912acb26185e288650cc656a3f38260a48468fd988de74f0dff59bc1e
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,21 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.0.7: 2017-07-06
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Supported screenshot.
|
8
|
+
|
9
|
+
* `chupa-text`: Added new options:
|
10
|
+
|
11
|
+
* `--need-screenshot`
|
12
|
+
|
13
|
+
* `--expected-screenshot-size=WIDTHxHEIGHT`
|
14
|
+
|
15
|
+
### Fixes
|
16
|
+
|
17
|
+
* CSV decomposer: Fixed an infinite loop bug.
|
18
|
+
|
3
19
|
## 1.0.6: 2017-07-05
|
4
20
|
|
5
21
|
### Improvements
|
data/lib/chupa-text.rb
CHANGED
@@ -28,11 +28,24 @@ module ChupaText
|
|
28
28
|
|
29
29
|
AVAILABLE_FORMATS = [:json, :text]
|
30
30
|
|
31
|
+
SIZE = /\A\d+x\d+\z/o
|
32
|
+
OptionParser.accept(SIZE, SIZE) do |value|
|
33
|
+
if value
|
34
|
+
begin
|
35
|
+
value.split("x").collect {|number| Integer(number)}
|
36
|
+
rescue ArgumentError
|
37
|
+
raise OptionParser::InvalidArgument, value
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
31
42
|
def initialize
|
32
43
|
@input = nil
|
33
|
-
@configuration = Configuration.
|
44
|
+
@configuration = Configuration.load_default
|
34
45
|
@enable_gems = true
|
35
46
|
@format = :json
|
47
|
+
@need_screenshot = true
|
48
|
+
@expected_screenshot_size = [200, 200]
|
36
49
|
end
|
37
50
|
|
38
51
|
def run(*arguments)
|
@@ -92,12 +105,25 @@ module ChupaText
|
|
92
105
|
"Appends PATH to decomposer load path.") do |path|
|
93
106
|
$LOAD_PATH << path
|
94
107
|
end
|
108
|
+
|
109
|
+
parser.separator("")
|
110
|
+
parser.separator("Output related options")
|
95
111
|
parser.on("--format=FORMAT", AVAILABLE_FORMATS,
|
96
112
|
"Output FORMAT.",
|
97
113
|
"[#{AVAILABLE_FORMATS.join(', ')}]",
|
98
|
-
"(default:
|
114
|
+
"(default: #{@format})") do |format|
|
99
115
|
@format = format
|
100
116
|
end
|
117
|
+
parser.on("--[no-]need-screenshot",
|
118
|
+
"Generate screenshot if available.",
|
119
|
+
"(default: #{@need_screenshot})") do |boolean|
|
120
|
+
@need_screenshot = boolean
|
121
|
+
end
|
122
|
+
parser.on("--expected-screenshot-size=WIDTHxHEIGHT", SIZE,
|
123
|
+
"Expected screenshot size.",
|
124
|
+
"(default: #{@expected_screenshot_size.join("x")})") do |size|
|
125
|
+
@expected_screenshot_size = size
|
126
|
+
end
|
101
127
|
|
102
128
|
parser.separator("")
|
103
129
|
parser.separator("Log related options:")
|
@@ -152,7 +178,7 @@ module ChupaText
|
|
152
178
|
|
153
179
|
def create_data
|
154
180
|
if @input.nil?
|
155
|
-
VirtualFileData.new(nil, $stdin)
|
181
|
+
data = VirtualFileData.new(nil, $stdin)
|
156
182
|
else
|
157
183
|
case @input
|
158
184
|
when /\A[a-z]+:\/\//i
|
@@ -160,8 +186,11 @@ module ChupaText
|
|
160
186
|
else
|
161
187
|
input = Pathname(@input)
|
162
188
|
end
|
163
|
-
InputData.new(input)
|
189
|
+
data = InputData.new(input)
|
164
190
|
end
|
191
|
+
data.need_screenshot = @need_screenshot
|
192
|
+
data.expected_screenshot_size = @expected_screenshot_size
|
193
|
+
data
|
165
194
|
end
|
166
195
|
|
167
196
|
def create_formatter
|
@@ -18,11 +18,10 @@ module ChupaText
|
|
18
18
|
class Configuration
|
19
19
|
class << self
|
20
20
|
def default
|
21
|
-
@default ||=
|
21
|
+
@default ||= load_default
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
def create_default
|
24
|
+
def load_default
|
26
25
|
configuration = new
|
27
26
|
loader = ConfigurationLoader.new(configuration)
|
28
27
|
loader.load("chupa-text.conf")
|
data/lib/chupa-text/data.rb
CHANGED
@@ -52,6 +52,17 @@ module ChupaText
|
|
52
52
|
# archive data in {#source}.
|
53
53
|
attr_accessor :source
|
54
54
|
|
55
|
+
# @return [Screenshot, nil] The screenshot of the data. For example,
|
56
|
+
# the first page image for a PDF file.
|
57
|
+
attr_accessor :screenshot
|
58
|
+
|
59
|
+
# @param [Bool] value `true` when screenshot is needed.
|
60
|
+
# @return [Bool] the specified value
|
61
|
+
attr_writer :need_screenshot
|
62
|
+
|
63
|
+
# @return [Array<Integer, Integer>] the expected screenshot size.
|
64
|
+
attr_accessor :expected_screenshot_size
|
65
|
+
|
55
66
|
def initialize(options={})
|
56
67
|
@uri = nil
|
57
68
|
@body = nil
|
@@ -60,9 +71,15 @@ module ChupaText
|
|
60
71
|
@mime_type = nil
|
61
72
|
@attributes = Attributes.new
|
62
73
|
@source = nil
|
74
|
+
@screenshot = nil
|
75
|
+
@need_screenshot = true
|
76
|
+
@expected_screenshot_size = [200, 200]
|
63
77
|
@options = options || {}
|
64
78
|
source_data = @options[:source_data]
|
65
|
-
|
79
|
+
if source_data
|
80
|
+
merge!(source_data)
|
81
|
+
@source = source_data
|
82
|
+
end
|
66
83
|
end
|
67
84
|
|
68
85
|
def initialize_copy(object)
|
@@ -86,6 +103,8 @@ module ChupaText
|
|
86
103
|
self["source-mime-types"] ||= []
|
87
104
|
self["source-mime-types"].unshift(data.mime_type)
|
88
105
|
end
|
106
|
+
self.need_screenshot = data.need_screenshot?
|
107
|
+
self.expected_screenshot_size = data.expected_screenshot_size
|
89
108
|
end
|
90
109
|
|
91
110
|
# @param [String, URI, nil] uri The URI for the data. If `uri` is
|
@@ -162,6 +181,11 @@ module ChupaText
|
|
162
181
|
mime_type == "text/plain"
|
163
182
|
end
|
164
183
|
|
184
|
+
# @return [Bool] `true` when screenshot is needed if available.
|
185
|
+
def need_screenshot?
|
186
|
+
@need_screenshot
|
187
|
+
end
|
188
|
+
|
165
189
|
private
|
166
190
|
def guess_mime_type
|
167
191
|
guess_mime_type_from_uri or
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,6 +14,7 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
+
require "cgi/util"
|
17
18
|
require "csv"
|
18
19
|
|
19
20
|
module ChupaText
|
@@ -22,8 +23,14 @@ module ChupaText
|
|
22
23
|
registry.register("csv", self)
|
23
24
|
|
24
25
|
def target?(data)
|
25
|
-
data.
|
26
|
-
|
26
|
+
return true if data.mime_type == "text/csv"
|
27
|
+
|
28
|
+
if data.text_plain? and
|
29
|
+
(data["source-mime-types"] || []).include?("text/csv")
|
30
|
+
return false
|
31
|
+
end
|
32
|
+
|
33
|
+
data.extension == "csv"
|
27
34
|
end
|
28
35
|
|
29
36
|
def decompose(data)
|
@@ -35,10 +42,42 @@ module ChupaText
|
|
35
42
|
text << "\n"
|
36
43
|
end
|
37
44
|
end
|
38
|
-
|
39
|
-
text_data
|
45
|
+
|
46
|
+
text_data = TextData.new(text, :source_data => data)
|
47
|
+
if data.need_screenshot?
|
48
|
+
text_data.screenshot = create_screenshot(data, text)
|
49
|
+
end
|
50
|
+
|
40
51
|
yield(text_data)
|
41
52
|
end
|
53
|
+
|
54
|
+
private
|
55
|
+
def create_screenshot(data, text)
|
56
|
+
width, height = data.expected_screenshot_size
|
57
|
+
max_n_lines = 10
|
58
|
+
font_size = height / max_n_lines
|
59
|
+
target_text = ""
|
60
|
+
text.each_line.with_index do |line, i|
|
61
|
+
break if i == max_n_lines
|
62
|
+
target_text << line
|
63
|
+
end
|
64
|
+
mime_type = "image/svg+xml"
|
65
|
+
data = <<-SVG
|
66
|
+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
67
|
+
<svg
|
68
|
+
xmlns="http://www.w3.org/2000/svg"
|
69
|
+
width="#{width}"
|
70
|
+
height="#{height}"
|
71
|
+
viewBox="0 0 #{width} #{height}">
|
72
|
+
<text
|
73
|
+
x="0"
|
74
|
+
y="#{font_size}"
|
75
|
+
style="font-size: #{font_size}px; white-space: pre-wrap;"
|
76
|
+
xml:space="preserve">#{CGI.escapeHTML(target_text)}</text>
|
77
|
+
</svg>
|
78
|
+
SVG
|
79
|
+
Screenshot.new(mime_type, data)
|
80
|
+
end
|
42
81
|
end
|
43
82
|
end
|
44
83
|
end
|
@@ -42,8 +42,7 @@ module ChupaText
|
|
42
42
|
when "tgz"
|
43
43
|
uri = data.uri.to_s.gsub(/\.tgz\z/i, ".tar")
|
44
44
|
end
|
45
|
-
extracted = VirtualFileData.new(uri, reader)
|
46
|
-
extracted.source = data
|
45
|
+
extracted = VirtualFileData.new(uri, reader, :source_data => data)
|
47
46
|
yield(extracted)
|
48
47
|
end
|
49
48
|
end
|
@@ -32,8 +32,8 @@ module ChupaText
|
|
32
32
|
reader.each do |entry|
|
33
33
|
next unless entry.file?
|
34
34
|
entry.extend(CopyStreamable)
|
35
|
-
extracted = VirtualFileData.new(entry.full_name, entry
|
36
|
-
|
35
|
+
extracted = VirtualFileData.new(entry.full_name, entry,
|
36
|
+
:source_data => data)
|
37
37
|
yield(extracted)
|
38
38
|
end
|
39
39
|
end
|
@@ -28,6 +28,16 @@ module ChupaText
|
|
28
28
|
text = {}
|
29
29
|
format_headers(data, text)
|
30
30
|
text["body"] = data.body
|
31
|
+
screenshot = data.screenshot
|
32
|
+
if screenshot
|
33
|
+
text["screenshot"] = {
|
34
|
+
"mime-type" => screenshot.mime_type,
|
35
|
+
"data" => screenshot.data,
|
36
|
+
}
|
37
|
+
if screenshot.encoding
|
38
|
+
text["screenshot"]["encoding"] = screenshot.encoding
|
39
|
+
end
|
40
|
+
end
|
31
41
|
@texts << text
|
32
42
|
end
|
33
43
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Copyright (C) 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
|
18
|
+
class Screenshot
|
19
|
+
# @return [String] The MIME type of the screenshot.
|
20
|
+
attr_reader :mime_type
|
21
|
+
|
22
|
+
# @return [String] The data of the screenshot.
|
23
|
+
attr_reader :data
|
24
|
+
|
25
|
+
# @return [String, nil] The encoding of the screenshot data.
|
26
|
+
# `nil` means that the data is raw data. It's used for SVG data
|
27
|
+
# because it's text data. `"base64"` means that the data is encoded
|
28
|
+
# by Base64. It's used for PNG data because it's binary data.
|
29
|
+
attr_reader :encoding
|
30
|
+
|
31
|
+
def initialize(mime_type, data, encoding=nil)
|
32
|
+
@mime_type = mime_type
|
33
|
+
@data = data
|
34
|
+
@encoding = encoding
|
35
|
+
end
|
36
|
+
|
37
|
+
def decoded_data
|
38
|
+
case @encoding
|
39
|
+
when "base64"
|
40
|
+
@data.unpack("m*")[0]
|
41
|
+
else
|
42
|
+
@data
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -182,4 +182,52 @@ class TestCommandChupaText < Test::Unit::TestCase
|
|
182
182
|
path.to_s))
|
183
183
|
end
|
184
184
|
end
|
185
|
+
|
186
|
+
sub_test_case("extract") do
|
187
|
+
def test_csv
|
188
|
+
fixture_name = "numbers.csv"
|
189
|
+
uri = fixture_uri(fixture_name)
|
190
|
+
path = fixture_path(fixture_name)
|
191
|
+
assert_equal([
|
192
|
+
true,
|
193
|
+
{
|
194
|
+
"uri" => uri.to_s,
|
195
|
+
"path" => path.to_s,
|
196
|
+
"mime-type" => "text/csv",
|
197
|
+
"size" => path.stat.size,
|
198
|
+
"texts" => [
|
199
|
+
{
|
200
|
+
"uri" => uri.to_s,
|
201
|
+
"path" => path.to_s,
|
202
|
+
"mime-type" => "text/plain",
|
203
|
+
"source-mime-types" => ["text/csv"],
|
204
|
+
"body" => "1 2 3\n4 5 6\n7 8 9\n",
|
205
|
+
"size" => 18,
|
206
|
+
"screenshot" => {
|
207
|
+
"mime-type" => "image/svg+xml",
|
208
|
+
"data" => <<-SVG
|
209
|
+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
210
|
+
<svg
|
211
|
+
xmlns="http://www.w3.org/2000/svg"
|
212
|
+
width="200"
|
213
|
+
height="200"
|
214
|
+
viewBox="0 0 200 200">
|
215
|
+
<text
|
216
|
+
x="0"
|
217
|
+
y="20"
|
218
|
+
style="font-size: 20px; white-space: pre-wrap;"
|
219
|
+
xml:space="preserve">1 2 3
|
220
|
+
4 5 6
|
221
|
+
7 8 9
|
222
|
+
</text>
|
223
|
+
</svg>
|
224
|
+
SVG
|
225
|
+
},
|
226
|
+
},
|
227
|
+
],
|
228
|
+
},
|
229
|
+
],
|
230
|
+
run_command(path.to_s))
|
231
|
+
end
|
232
|
+
end
|
185
233
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.6
|
4
|
+
version: 1.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -132,6 +132,7 @@ files:
|
|
132
132
|
- lib/chupa-text/logger.rb
|
133
133
|
- lib/chupa-text/mime-type-registry.rb
|
134
134
|
- lib/chupa-text/mime-type.rb
|
135
|
+
- lib/chupa-text/screenshot.rb
|
135
136
|
- lib/chupa-text/size-parser.rb
|
136
137
|
- lib/chupa-text/text-data.rb
|
137
138
|
- lib/chupa-text/version.rb
|
@@ -145,6 +146,7 @@ files:
|
|
145
146
|
- test/fixture/command/chupa-text/hello.txt
|
146
147
|
- test/fixture/command/chupa-text/hello.txt.gz
|
147
148
|
- test/fixture/command/chupa-text/no-decomposer.conf
|
149
|
+
- test/fixture/command/chupa-text/numbers.csv
|
148
150
|
- test/fixture/extractor/hello.txt
|
149
151
|
- test/fixture/gzip/hello.tar.gz
|
150
152
|
- test/fixture/gzip/hello.tgz
|