chupa-text 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +16 -0
- data/lib/chupa-text.rb +2 -0
- data/lib/chupa-text/command/chupa-text.rb +33 -4
- data/lib/chupa-text/configuration.rb +2 -3
- data/lib/chupa-text/data.rb +25 -1
- data/lib/chupa-text/decomposers/csv.rb +44 -5
- data/lib/chupa-text/decomposers/gzip.rb +1 -2
- data/lib/chupa-text/decomposers/tar.rb +2 -2
- data/lib/chupa-text/decomposers/xml.rb +1 -2
- data/lib/chupa-text/formatters/hash.rb +10 -0
- data/lib/chupa-text/screenshot.rb +46 -0
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +48 -0
- data/test/fixture/command/chupa-text/numbers.csv +3 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a50556c287fd5148a3032b4fd8518679237e7be
|
4
|
+
data.tar.gz: 0d58a4948b3df081449e67211e23168b539be83b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 298a7416122a757fbb6018c73327f768adaa6b18402f7725e5c9dde12b320d00a7015c04a5eb90580b4c6f822678acded64e132a8ba9d8d1b1de792804d13103
|
7
|
+
data.tar.gz: 3aaaa8b120e085434094163d3e582cb4bb33c5478a2916a268cd5b03cfff10b48cd05f6912acb26185e288650cc656a3f38260a48468fd988de74f0dff59bc1e
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,21 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.0.7: 2017-07-06
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Supported screenshot.
|
8
|
+
|
9
|
+
* `chupa-text`: Added new options:
|
10
|
+
|
11
|
+
* `--need-screenshot`
|
12
|
+
|
13
|
+
* `--expected-screenshot-size=WIDTHxHEIGHT`
|
14
|
+
|
15
|
+
### Fixes
|
16
|
+
|
17
|
+
* CSV decomposer: Fixed an infinite loop bug.
|
18
|
+
|
3
19
|
## 1.0.6: 2017-07-05
|
4
20
|
|
5
21
|
### Improvements
|
data/lib/chupa-text.rb
CHANGED
@@ -28,11 +28,24 @@ module ChupaText
|
|
28
28
|
|
29
29
|
AVAILABLE_FORMATS = [:json, :text]
|
30
30
|
|
31
|
+
SIZE = /\A\d+x\d+\z/o
|
32
|
+
OptionParser.accept(SIZE, SIZE) do |value|
|
33
|
+
if value
|
34
|
+
begin
|
35
|
+
value.split("x").collect {|number| Integer(number)}
|
36
|
+
rescue ArgumentError
|
37
|
+
raise OptionParser::InvalidArgument, value
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
31
42
|
def initialize
|
32
43
|
@input = nil
|
33
|
-
@configuration = Configuration.
|
44
|
+
@configuration = Configuration.load_default
|
34
45
|
@enable_gems = true
|
35
46
|
@format = :json
|
47
|
+
@need_screenshot = true
|
48
|
+
@expected_screenshot_size = [200, 200]
|
36
49
|
end
|
37
50
|
|
38
51
|
def run(*arguments)
|
@@ -92,12 +105,25 @@ module ChupaText
|
|
92
105
|
"Appends PATH to decomposer load path.") do |path|
|
93
106
|
$LOAD_PATH << path
|
94
107
|
end
|
108
|
+
|
109
|
+
parser.separator("")
|
110
|
+
parser.separator("Output related options")
|
95
111
|
parser.on("--format=FORMAT", AVAILABLE_FORMATS,
|
96
112
|
"Output FORMAT.",
|
97
113
|
"[#{AVAILABLE_FORMATS.join(', ')}]",
|
98
|
-
"(default:
|
114
|
+
"(default: #{@format})") do |format|
|
99
115
|
@format = format
|
100
116
|
end
|
117
|
+
parser.on("--[no-]need-screenshot",
|
118
|
+
"Generate screenshot if available.",
|
119
|
+
"(default: #{@need_screenshot})") do |boolean|
|
120
|
+
@need_screenshot = boolean
|
121
|
+
end
|
122
|
+
parser.on("--expected-screenshot-size=WIDTHxHEIGHT", SIZE,
|
123
|
+
"Expected screenshot size.",
|
124
|
+
"(default: #{@expected_screenshot_size.join("x")})") do |size|
|
125
|
+
@expected_screenshot_size = size
|
126
|
+
end
|
101
127
|
|
102
128
|
parser.separator("")
|
103
129
|
parser.separator("Log related options:")
|
@@ -152,7 +178,7 @@ module ChupaText
|
|
152
178
|
|
153
179
|
def create_data
|
154
180
|
if @input.nil?
|
155
|
-
VirtualFileData.new(nil, $stdin)
|
181
|
+
data = VirtualFileData.new(nil, $stdin)
|
156
182
|
else
|
157
183
|
case @input
|
158
184
|
when /\A[a-z]+:\/\//i
|
@@ -160,8 +186,11 @@ module ChupaText
|
|
160
186
|
else
|
161
187
|
input = Pathname(@input)
|
162
188
|
end
|
163
|
-
InputData.new(input)
|
189
|
+
data = InputData.new(input)
|
164
190
|
end
|
191
|
+
data.need_screenshot = @need_screenshot
|
192
|
+
data.expected_screenshot_size = @expected_screenshot_size
|
193
|
+
data
|
165
194
|
end
|
166
195
|
|
167
196
|
def create_formatter
|
@@ -18,11 +18,10 @@ module ChupaText
|
|
18
18
|
class Configuration
|
19
19
|
class << self
|
20
20
|
def default
|
21
|
-
@default ||=
|
21
|
+
@default ||= load_default
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
def create_default
|
24
|
+
def load_default
|
26
25
|
configuration = new
|
27
26
|
loader = ConfigurationLoader.new(configuration)
|
28
27
|
loader.load("chupa-text.conf")
|
data/lib/chupa-text/data.rb
CHANGED
@@ -52,6 +52,17 @@ module ChupaText
|
|
52
52
|
# archive data in {#source}.
|
53
53
|
attr_accessor :source
|
54
54
|
|
55
|
+
# @return [Screenshot, nil] The screenshot of the data. For example,
|
56
|
+
# the first page image for a PDF file.
|
57
|
+
attr_accessor :screenshot
|
58
|
+
|
59
|
+
# @param [Bool] value `true` when screenshot is needed.
|
60
|
+
# @return [Bool] the specified value
|
61
|
+
attr_writer :need_screenshot
|
62
|
+
|
63
|
+
# @return [Array<Integer, Integer>] the expected screenshot size.
|
64
|
+
attr_accessor :expected_screenshot_size
|
65
|
+
|
55
66
|
def initialize(options={})
|
56
67
|
@uri = nil
|
57
68
|
@body = nil
|
@@ -60,9 +71,15 @@ module ChupaText
|
|
60
71
|
@mime_type = nil
|
61
72
|
@attributes = Attributes.new
|
62
73
|
@source = nil
|
74
|
+
@screenshot = nil
|
75
|
+
@need_screenshot = true
|
76
|
+
@expected_screenshot_size = [200, 200]
|
63
77
|
@options = options || {}
|
64
78
|
source_data = @options[:source_data]
|
65
|
-
|
79
|
+
if source_data
|
80
|
+
merge!(source_data)
|
81
|
+
@source = source_data
|
82
|
+
end
|
66
83
|
end
|
67
84
|
|
68
85
|
def initialize_copy(object)
|
@@ -86,6 +103,8 @@ module ChupaText
|
|
86
103
|
self["source-mime-types"] ||= []
|
87
104
|
self["source-mime-types"].unshift(data.mime_type)
|
88
105
|
end
|
106
|
+
self.need_screenshot = data.need_screenshot?
|
107
|
+
self.expected_screenshot_size = data.expected_screenshot_size
|
89
108
|
end
|
90
109
|
|
91
110
|
# @param [String, URI, nil] uri The URI for the data. If `uri` is
|
@@ -162,6 +181,11 @@ module ChupaText
|
|
162
181
|
mime_type == "text/plain"
|
163
182
|
end
|
164
183
|
|
184
|
+
# @return [Bool] `true` when screenshot is needed if available.
|
185
|
+
def need_screenshot?
|
186
|
+
@need_screenshot
|
187
|
+
end
|
188
|
+
|
165
189
|
private
|
166
190
|
def guess_mime_type
|
167
191
|
guess_mime_type_from_uri or
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -14,6 +14,7 @@
|
|
14
14
|
# License along with this library; if not, write to the Free Software
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
+
require "cgi/util"
|
17
18
|
require "csv"
|
18
19
|
|
19
20
|
module ChupaText
|
@@ -22,8 +23,14 @@ module ChupaText
|
|
22
23
|
registry.register("csv", self)
|
23
24
|
|
24
25
|
def target?(data)
|
25
|
-
data.
|
26
|
-
|
26
|
+
return true if data.mime_type == "text/csv"
|
27
|
+
|
28
|
+
if data.text_plain? and
|
29
|
+
(data["source-mime-types"] || []).include?("text/csv")
|
30
|
+
return false
|
31
|
+
end
|
32
|
+
|
33
|
+
data.extension == "csv"
|
27
34
|
end
|
28
35
|
|
29
36
|
def decompose(data)
|
@@ -35,10 +42,42 @@ module ChupaText
|
|
35
42
|
text << "\n"
|
36
43
|
end
|
37
44
|
end
|
38
|
-
|
39
|
-
text_data
|
45
|
+
|
46
|
+
text_data = TextData.new(text, :source_data => data)
|
47
|
+
if data.need_screenshot?
|
48
|
+
text_data.screenshot = create_screenshot(data, text)
|
49
|
+
end
|
50
|
+
|
40
51
|
yield(text_data)
|
41
52
|
end
|
53
|
+
|
54
|
+
private
|
55
|
+
def create_screenshot(data, text)
|
56
|
+
width, height = data.expected_screenshot_size
|
57
|
+
max_n_lines = 10
|
58
|
+
font_size = height / max_n_lines
|
59
|
+
target_text = ""
|
60
|
+
text.each_line.with_index do |line, i|
|
61
|
+
break if i == max_n_lines
|
62
|
+
target_text << line
|
63
|
+
end
|
64
|
+
mime_type = "image/svg+xml"
|
65
|
+
data = <<-SVG
|
66
|
+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
67
|
+
<svg
|
68
|
+
xmlns="http://www.w3.org/2000/svg"
|
69
|
+
width="#{width}"
|
70
|
+
height="#{height}"
|
71
|
+
viewBox="0 0 #{width} #{height}">
|
72
|
+
<text
|
73
|
+
x="0"
|
74
|
+
y="#{font_size}"
|
75
|
+
style="font-size: #{font_size}px; white-space: pre-wrap;"
|
76
|
+
xml:space="preserve">#{CGI.escapeHTML(target_text)}</text>
|
77
|
+
</svg>
|
78
|
+
SVG
|
79
|
+
Screenshot.new(mime_type, data)
|
80
|
+
end
|
42
81
|
end
|
43
82
|
end
|
44
83
|
end
|
@@ -42,8 +42,7 @@ module ChupaText
|
|
42
42
|
when "tgz"
|
43
43
|
uri = data.uri.to_s.gsub(/\.tgz\z/i, ".tar")
|
44
44
|
end
|
45
|
-
extracted = VirtualFileData.new(uri, reader)
|
46
|
-
extracted.source = data
|
45
|
+
extracted = VirtualFileData.new(uri, reader, :source_data => data)
|
47
46
|
yield(extracted)
|
48
47
|
end
|
49
48
|
end
|
@@ -32,8 +32,8 @@ module ChupaText
|
|
32
32
|
reader.each do |entry|
|
33
33
|
next unless entry.file?
|
34
34
|
entry.extend(CopyStreamable)
|
35
|
-
extracted = VirtualFileData.new(entry.full_name, entry
|
36
|
-
|
35
|
+
extracted = VirtualFileData.new(entry.full_name, entry,
|
36
|
+
:source_data => data)
|
37
37
|
yield(extracted)
|
38
38
|
end
|
39
39
|
end
|
@@ -28,6 +28,16 @@ module ChupaText
|
|
28
28
|
text = {}
|
29
29
|
format_headers(data, text)
|
30
30
|
text["body"] = data.body
|
31
|
+
screenshot = data.screenshot
|
32
|
+
if screenshot
|
33
|
+
text["screenshot"] = {
|
34
|
+
"mime-type" => screenshot.mime_type,
|
35
|
+
"data" => screenshot.data,
|
36
|
+
}
|
37
|
+
if screenshot.encoding
|
38
|
+
text["screenshot"]["encoding"] = screenshot.encoding
|
39
|
+
end
|
40
|
+
end
|
31
41
|
@texts << text
|
32
42
|
end
|
33
43
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Copyright (C) 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
|
18
|
+
class Screenshot
|
19
|
+
# @return [String] The MIME type of the screenshot.
|
20
|
+
attr_reader :mime_type
|
21
|
+
|
22
|
+
# @return [String] The data of the screenshot.
|
23
|
+
attr_reader :data
|
24
|
+
|
25
|
+
# @return [String, nil] The encoding of the screenshot data.
|
26
|
+
# `nil` means that the data is raw data. It's used for SVG data
|
27
|
+
# because it's text data. `"base64"` means that the data is encoded
|
28
|
+
# by Base64. It's used for PNG data because it's binary data.
|
29
|
+
attr_reader :encoding
|
30
|
+
|
31
|
+
def initialize(mime_type, data, encoding=nil)
|
32
|
+
@mime_type = mime_type
|
33
|
+
@data = data
|
34
|
+
@encoding = encoding
|
35
|
+
end
|
36
|
+
|
37
|
+
def decoded_data
|
38
|
+
case @encoding
|
39
|
+
when "base64"
|
40
|
+
@data.unpack("m*")[0]
|
41
|
+
else
|
42
|
+
@data
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -182,4 +182,52 @@ class TestCommandChupaText < Test::Unit::TestCase
|
|
182
182
|
path.to_s))
|
183
183
|
end
|
184
184
|
end
|
185
|
+
|
186
|
+
sub_test_case("extract") do
|
187
|
+
def test_csv
|
188
|
+
fixture_name = "numbers.csv"
|
189
|
+
uri = fixture_uri(fixture_name)
|
190
|
+
path = fixture_path(fixture_name)
|
191
|
+
assert_equal([
|
192
|
+
true,
|
193
|
+
{
|
194
|
+
"uri" => uri.to_s,
|
195
|
+
"path" => path.to_s,
|
196
|
+
"mime-type" => "text/csv",
|
197
|
+
"size" => path.stat.size,
|
198
|
+
"texts" => [
|
199
|
+
{
|
200
|
+
"uri" => uri.to_s,
|
201
|
+
"path" => path.to_s,
|
202
|
+
"mime-type" => "text/plain",
|
203
|
+
"source-mime-types" => ["text/csv"],
|
204
|
+
"body" => "1 2 3\n4 5 6\n7 8 9\n",
|
205
|
+
"size" => 18,
|
206
|
+
"screenshot" => {
|
207
|
+
"mime-type" => "image/svg+xml",
|
208
|
+
"data" => <<-SVG
|
209
|
+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
210
|
+
<svg
|
211
|
+
xmlns="http://www.w3.org/2000/svg"
|
212
|
+
width="200"
|
213
|
+
height="200"
|
214
|
+
viewBox="0 0 200 200">
|
215
|
+
<text
|
216
|
+
x="0"
|
217
|
+
y="20"
|
218
|
+
style="font-size: 20px; white-space: pre-wrap;"
|
219
|
+
xml:space="preserve">1 2 3
|
220
|
+
4 5 6
|
221
|
+
7 8 9
|
222
|
+
</text>
|
223
|
+
</svg>
|
224
|
+
SVG
|
225
|
+
},
|
226
|
+
},
|
227
|
+
],
|
228
|
+
},
|
229
|
+
],
|
230
|
+
run_command(path.to_s))
|
231
|
+
end
|
232
|
+
end
|
185
233
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -132,6 +132,7 @@ files:
|
|
132
132
|
- lib/chupa-text/logger.rb
|
133
133
|
- lib/chupa-text/mime-type-registry.rb
|
134
134
|
- lib/chupa-text/mime-type.rb
|
135
|
+
- lib/chupa-text/screenshot.rb
|
135
136
|
- lib/chupa-text/size-parser.rb
|
136
137
|
- lib/chupa-text/text-data.rb
|
137
138
|
- lib/chupa-text/version.rb
|
@@ -145,6 +146,7 @@ files:
|
|
145
146
|
- test/fixture/command/chupa-text/hello.txt
|
146
147
|
- test/fixture/command/chupa-text/hello.txt.gz
|
147
148
|
- test/fixture/command/chupa-text/no-decomposer.conf
|
149
|
+
- test/fixture/command/chupa-text/numbers.csv
|
148
150
|
- test/fixture/extractor/hello.txt
|
149
151
|
- test/fixture/gzip/hello.tar.gz
|
150
152
|
- test/fixture/gzip/hello.tgz
|