chupa-text 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data/mime-types.conf +13 -11
- data/doc/text/command-line.md +2 -2
- data/doc/text/news.md +12 -0
- data/lib/chupa-text/command/chupa-text.rb +15 -1
- data/lib/chupa-text/configuration-loader.rb +3 -3
- data/lib/chupa-text/data.rb +6 -0
- data/lib/chupa-text/decomposers.rb +14 -0
- data/lib/chupa-text/extractor.rb +7 -6
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +1 -1
- data/test/test-data.rb +34 -0
- data/test/test-extractor.rb +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c04261f7c3b4c26a0d67f18f473c8adf1d435080
|
4
|
+
data.tar.gz: 4e1a8da97abfa817226f919fcb672fb55d570d4d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00ada22d44d1f41ca4e2f08f02c43b41b193453ee33477dd02e9f5a9fbc0d87f5de46c53bc36a77b1494c2f8729a01e72419f9c9c9738d8819c04dde06c50d44
|
7
|
+
data.tar.gz: 7d404268095b308d4b7158e70458ac31d2dade5a0a131431559e84b34b563fb94454d5a3e900684dfe84da2642b382a14c55a479192992b141136a5f8659faf9
|
data/data/mime-types.conf
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
|
3
|
-
|
3
|
+
mime_types["txt"] = "text/plain"
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
mime_types["gz"] = "application/x-gzip"
|
6
|
+
mime_types["tgz"] = "application/x-gtar-compressed"
|
7
7
|
|
8
|
-
|
8
|
+
mime_types["tar"] = "application/x-tar"
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
mime_types["htm"] = "text/html"
|
11
|
+
mime_types["html"] = "text/html"
|
12
|
+
mime_types["xhtml"] = "application/xhtml+xml"
|
13
13
|
|
14
|
-
|
14
|
+
mime_types["xml"] = "text/xml"
|
15
15
|
|
16
|
-
|
16
|
+
mime_types["css"] = "text/css"
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
mime_types["csv"] = "text/csv"
|
19
|
+
mime_types["tsv"] = "text/tab-separated-values"
|
20
|
+
|
21
|
+
mime_types["pdf"] = "application/pdf"
|
data/doc/text/command-line.md
CHANGED
@@ -123,14 +123,14 @@ command. You can use glob pattern for decomposer name such as
|
|
123
123
|
The default is `["*"]`. It means that all installed decomposers are
|
124
124
|
used.
|
125
125
|
|
126
|
-
`
|
126
|
+
`mime_types["<extension>"] = "<MIME type>"`
|
127
127
|
|
128
128
|
It specifies a map to a MIME type from path extension.
|
129
129
|
|
130
130
|
Here is an example that maps `"html"` to `"text/html"`:
|
131
131
|
|
132
132
|
```
|
133
|
-
|
133
|
+
mime_types["html"] = "text/html"
|
134
134
|
```
|
135
135
|
|
136
136
|
The default configuration file registers popular MIME types.
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.0.1: 2014-01-05
|
4
|
+
|
5
|
+
* chupa-text: Supported loading decomposers installed by RubyGems.
|
6
|
+
* chupa-text: Added `--disable-gems` option that disables loading
|
7
|
+
decomposers installed by RubyGems.
|
8
|
+
* chupa-text: Added `-I` option to use decomposers that are not
|
9
|
+
installed by RubyGems.
|
10
|
+
* Added {ChupaText::Data#text_plain?}.
|
11
|
+
* configuration: Renamed `mime_type` to `mime_types` because it
|
12
|
+
handles a set of MIME types.
|
13
|
+
* configuration: Added PDF to the default MIME type mappings.
|
14
|
+
|
3
15
|
## 1.0.0: 2014-01-05
|
4
16
|
|
5
17
|
The first release!!!
|
@@ -29,12 +29,13 @@ module ChupaText
|
|
29
29
|
def initialize
|
30
30
|
@input = nil
|
31
31
|
@configuration = Configuration.default
|
32
|
+
@enable_gems = true
|
32
33
|
end
|
33
34
|
|
34
35
|
def run(*arguments)
|
35
36
|
return false unless parse_arguments(arguments)
|
36
37
|
|
37
|
-
|
38
|
+
load_decomposers
|
38
39
|
extractor = create_extractor
|
39
40
|
data = create_data
|
40
41
|
formatter = create_formatter
|
@@ -77,9 +78,22 @@ module ChupaText
|
|
77
78
|
"Read configuration from FILE.") do |path|
|
78
79
|
load_configuration(path)
|
79
80
|
end
|
81
|
+
parser.on("--disable-gems",
|
82
|
+
"Disable decomposers installed by RubyGems.") do
|
83
|
+
@enable_gems = false
|
84
|
+
end
|
85
|
+
parser.on("-I=PATH",
|
86
|
+
"Append PATH to decomposer load path.") do |path|
|
87
|
+
$LOAD_PATH << path
|
88
|
+
end
|
80
89
|
parser
|
81
90
|
end
|
82
91
|
|
92
|
+
def load_decomposers
|
93
|
+
Decomposers.enable_all_gems if @enable_gems
|
94
|
+
Decomposers.load
|
95
|
+
end
|
96
|
+
|
83
97
|
def create_extractor
|
84
98
|
extractor = Extractor.new
|
85
99
|
extractor.apply_configuration(@configuration)
|
@@ -19,11 +19,11 @@ require "pathname"
|
|
19
19
|
module ChupaText
|
20
20
|
class ConfigurationLoader
|
21
21
|
attr_reader :decomposer
|
22
|
-
attr_reader :
|
22
|
+
attr_reader :mime_types
|
23
23
|
def initialize(configuration)
|
24
24
|
@configuration = configuration
|
25
25
|
@decomposer = DecomposerLoader.new(@configuration.decomposer)
|
26
|
-
@
|
26
|
+
@mime_types = MIMETypesLoader.new(@configuration.mime_type_registry)
|
27
27
|
@load_paths = []
|
28
28
|
data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
|
29
29
|
@load_paths << File.expand_path(data_dir)
|
@@ -82,7 +82,7 @@ module ChupaText
|
|
82
82
|
end
|
83
83
|
end
|
84
84
|
|
85
|
-
class
|
85
|
+
class MIMETypesLoader
|
86
86
|
def initialize(registry)
|
87
87
|
@registry = registry
|
88
88
|
end
|
data/lib/chupa-text/data.rb
CHANGED
@@ -117,6 +117,12 @@ module ChupaText
|
|
117
117
|
(mime_type || "").start_with?("text/")
|
118
118
|
end
|
119
119
|
|
120
|
+
# @return [Bool] true if MIME type is "text/plain", false
|
121
|
+
# otherwise.
|
122
|
+
def text_plain?
|
123
|
+
mime_type == "text/plain"
|
124
|
+
end
|
125
|
+
|
120
126
|
private
|
121
127
|
def guess_mime_type
|
122
128
|
guess_mime_type_from_uri or
|
@@ -17,6 +17,20 @@
|
|
17
17
|
module ChupaText
|
18
18
|
module Decomposers
|
19
19
|
class << self
|
20
|
+
def enable_all_gems
|
21
|
+
decomposer_specs = Gem::Specification.find_all do |spec|
|
22
|
+
spec.name.start_with?("chupa-text-decomposer-")
|
23
|
+
end
|
24
|
+
grouped_decomposer_specs = decomposer_specs.group_by(&:name)
|
25
|
+
latest_decomposer_specs = []
|
26
|
+
grouped_decomposer_specs.each do |name, specs|
|
27
|
+
latest_decomposer_specs << specs.sort_by(&:version).last
|
28
|
+
end
|
29
|
+
latest_decomposer_specs.each do |spec|
|
30
|
+
gem(spec.name, spec.version)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
20
34
|
def load
|
21
35
|
paths = []
|
22
36
|
$LOAD_PATH.each do |load_path|
|
data/lib/chupa-text/extractor.rb
CHANGED
@@ -59,6 +59,10 @@ module ChupaText
|
|
59
59
|
targets = [ensure_data(input)]
|
60
60
|
until targets.empty?
|
61
61
|
target = targets.pop
|
62
|
+
if target.text_plain?
|
63
|
+
yield(target)
|
64
|
+
next
|
65
|
+
end
|
62
66
|
decomposer = find_decomposer(target)
|
63
67
|
if decomposer.nil?
|
64
68
|
yield(target) if target.text?
|
@@ -72,13 +76,10 @@ module ChupaText
|
|
72
76
|
|
73
77
|
private
|
74
78
|
def ensure_data(input)
|
75
|
-
|
76
|
-
when String, Pathname, URI::Generic
|
77
|
-
data = Data.new
|
78
|
-
data.uri = input.to_s
|
79
|
-
data
|
80
|
-
else
|
79
|
+
if input.is_a?(Data)
|
81
80
|
input
|
81
|
+
else
|
82
|
+
InputData.new(input)
|
82
83
|
end
|
83
84
|
end
|
84
85
|
|
data/lib/chupa-text/version.rb
CHANGED
@@ -44,7 +44,7 @@ class TestCommandChupaText < Test::Unit::TestCase
|
|
44
44
|
|
45
45
|
def run_command(*arguments)
|
46
46
|
succeeded = wrap_io do
|
47
|
-
ChupaText::Command::ChupaText.run(*arguments)
|
47
|
+
ChupaText::Command::ChupaText.run("--disable-gems", *arguments)
|
48
48
|
end
|
49
49
|
[succeeded, JSON.parse(@stdout.string)]
|
50
50
|
end
|
data/test/test-data.rb
CHANGED
@@ -56,6 +56,40 @@ class TestData < Test::Unit::TestCase
|
|
56
56
|
end
|
57
57
|
end
|
58
58
|
end
|
59
|
+
|
60
|
+
sub_test_case("text?") do
|
61
|
+
def test_text_plain
|
62
|
+
@data.mime_type = "text/plain"
|
63
|
+
assert_true(@data.text?)
|
64
|
+
end
|
65
|
+
|
66
|
+
def test_text_html
|
67
|
+
@data.mime_type = "text/html"
|
68
|
+
assert_true(@data.text?)
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_application_xhtml_xml
|
72
|
+
@data.mime_type = "application/xhtml+xml"
|
73
|
+
assert_false(@data.text?)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
sub_test_case("text_plain?") do
|
78
|
+
def test_text_plain
|
79
|
+
@data.mime_type = "text/plain"
|
80
|
+
assert_true(@data.text_plain?)
|
81
|
+
end
|
82
|
+
|
83
|
+
def test_text_html
|
84
|
+
@data.mime_type = "text/html"
|
85
|
+
assert_false(@data.text_plain?)
|
86
|
+
end
|
87
|
+
|
88
|
+
def test_application_xhtml_xml
|
89
|
+
@data.mime_type = "application/xhtml+xml"
|
90
|
+
assert_false(@data.text_plain?)
|
91
|
+
end
|
92
|
+
end
|
59
93
|
end
|
60
94
|
|
61
95
|
sub_test_case("extension") do
|
data/test/test-extractor.rb
CHANGED
@@ -97,12 +97,12 @@ class TestExtractor < Test::Unit::TestCase
|
|
97
97
|
sub_test_case("multi decomposed") do
|
98
98
|
class CopyDecomposer < ChupaText::Decomposer
|
99
99
|
def target?(data)
|
100
|
-
data
|
100
|
+
data.mime_type == "text/x-plain"
|
101
101
|
end
|
102
102
|
|
103
103
|
def decompose(data)
|
104
104
|
copied_data = data.dup
|
105
|
-
copied_data
|
105
|
+
copied_data.mime_type = "text/plain"
|
106
106
|
yield(copied_data.dup)
|
107
107
|
yield(copied_data.dup)
|
108
108
|
end
|
@@ -116,7 +116,7 @@ class TestExtractor < Test::Unit::TestCase
|
|
116
116
|
|
117
117
|
def test_decompose
|
118
118
|
data = ChupaText::Data.new
|
119
|
-
data.mime_type = "text/plain"
|
119
|
+
data.mime_type = "text/x-plain"
|
120
120
|
data.body = "Hello"
|
121
121
|
assert_equal(["Hello", "Hello"], extract(data))
|
122
122
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01-
|
11
|
+
date: 2014-01-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|