chupa-text 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data/mime-types.conf +13 -11
- data/doc/text/command-line.md +2 -2
- data/doc/text/news.md +12 -0
- data/lib/chupa-text/command/chupa-text.rb +15 -1
- data/lib/chupa-text/configuration-loader.rb +3 -3
- data/lib/chupa-text/data.rb +6 -0
- data/lib/chupa-text/decomposers.rb +14 -0
- data/lib/chupa-text/extractor.rb +7 -6
- data/lib/chupa-text/version.rb +1 -1
- data/test/command/test-chupa-text.rb +1 -1
- data/test/test-data.rb +34 -0
- data/test/test-extractor.rb +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c04261f7c3b4c26a0d67f18f473c8adf1d435080
|
4
|
+
data.tar.gz: 4e1a8da97abfa817226f919fcb672fb55d570d4d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00ada22d44d1f41ca4e2f08f02c43b41b193453ee33477dd02e9f5a9fbc0d87f5de46c53bc36a77b1494c2f8729a01e72419f9c9c9738d8819c04dde06c50d44
|
7
|
+
data.tar.gz: 7d404268095b308d4b7158e70458ac31d2dade5a0a131431559e84b34b563fb94454d5a3e900684dfe84da2642b382a14c55a479192992b141136a5f8659faf9
|
data/data/mime-types.conf
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
|
3
|
-
|
3
|
+
mime_types["txt"] = "text/plain"
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
mime_types["gz"] = "application/x-gzip"
|
6
|
+
mime_types["tgz"] = "application/x-gtar-compressed"
|
7
7
|
|
8
|
-
|
8
|
+
mime_types["tar"] = "application/x-tar"
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
mime_types["htm"] = "text/html"
|
11
|
+
mime_types["html"] = "text/html"
|
12
|
+
mime_types["xhtml"] = "application/xhtml+xml"
|
13
13
|
|
14
|
-
|
14
|
+
mime_types["xml"] = "text/xml"
|
15
15
|
|
16
|
-
|
16
|
+
mime_types["css"] = "text/css"
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
mime_types["csv"] = "text/csv"
|
19
|
+
mime_types["tsv"] = "text/tab-separated-values"
|
20
|
+
|
21
|
+
mime_types["pdf"] = "application/pdf"
|
data/doc/text/command-line.md
CHANGED
@@ -123,14 +123,14 @@ command. You can use glob pattern for decomposer name such as
|
|
123
123
|
The default is `["*"]`. It means that all installed decomposers are
|
124
124
|
used.
|
125
125
|
|
126
|
-
`
|
126
|
+
`mime_types["<extension>"] = "<MIME type>"`
|
127
127
|
|
128
128
|
It specifies a map to a MIME type from path extension.
|
129
129
|
|
130
130
|
Here is an example that maps `"html"` to `"text/html"`:
|
131
131
|
|
132
132
|
```
|
133
|
-
|
133
|
+
mime_types["html"] = "text/html"
|
134
134
|
```
|
135
135
|
|
136
136
|
The default configuration file registers popular MIME types.
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.0.1: 2014-01-05
|
4
|
+
|
5
|
+
* chupa-text: Supported loading decomposers installed by RubyGems.
|
6
|
+
* chupa-text: Added `--disable-gems` option that disables loading
|
7
|
+
decomposers installed by RubyGems.
|
8
|
+
* chupa-text: Added `-I` option to use decomposers that are not
|
9
|
+
installed by RubyGems.
|
10
|
+
* Added {ChupaText::Data#text_plain?}.
|
11
|
+
* configuration: Renamed `mime_type` to `mime_types` because it
|
12
|
+
handles a set of MIME types.
|
13
|
+
* configuration: Added PDF to the default MIME type mappings.
|
14
|
+
|
3
15
|
## 1.0.0: 2014-01-05
|
4
16
|
|
5
17
|
The first release!!!
|
@@ -29,12 +29,13 @@ module ChupaText
|
|
29
29
|
def initialize
|
30
30
|
@input = nil
|
31
31
|
@configuration = Configuration.default
|
32
|
+
@enable_gems = true
|
32
33
|
end
|
33
34
|
|
34
35
|
def run(*arguments)
|
35
36
|
return false unless parse_arguments(arguments)
|
36
37
|
|
37
|
-
|
38
|
+
load_decomposers
|
38
39
|
extractor = create_extractor
|
39
40
|
data = create_data
|
40
41
|
formatter = create_formatter
|
@@ -77,9 +78,22 @@ module ChupaText
|
|
77
78
|
"Read configuration from FILE.") do |path|
|
78
79
|
load_configuration(path)
|
79
80
|
end
|
81
|
+
parser.on("--disable-gems",
|
82
|
+
"Disable decomposers installed by RubyGems.") do
|
83
|
+
@enable_gems = false
|
84
|
+
end
|
85
|
+
parser.on("-I=PATH",
|
86
|
+
"Append PATH to decomposer load path.") do |path|
|
87
|
+
$LOAD_PATH << path
|
88
|
+
end
|
80
89
|
parser
|
81
90
|
end
|
82
91
|
|
92
|
+
def load_decomposers
|
93
|
+
Decomposers.enable_all_gems if @enable_gems
|
94
|
+
Decomposers.load
|
95
|
+
end
|
96
|
+
|
83
97
|
def create_extractor
|
84
98
|
extractor = Extractor.new
|
85
99
|
extractor.apply_configuration(@configuration)
|
@@ -19,11 +19,11 @@ require "pathname"
|
|
19
19
|
module ChupaText
|
20
20
|
class ConfigurationLoader
|
21
21
|
attr_reader :decomposer
|
22
|
-
attr_reader :
|
22
|
+
attr_reader :mime_types
|
23
23
|
def initialize(configuration)
|
24
24
|
@configuration = configuration
|
25
25
|
@decomposer = DecomposerLoader.new(@configuration.decomposer)
|
26
|
-
@
|
26
|
+
@mime_types = MIMETypesLoader.new(@configuration.mime_type_registry)
|
27
27
|
@load_paths = []
|
28
28
|
data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
|
29
29
|
@load_paths << File.expand_path(data_dir)
|
@@ -82,7 +82,7 @@ module ChupaText
|
|
82
82
|
end
|
83
83
|
end
|
84
84
|
|
85
|
-
class
|
85
|
+
class MIMETypesLoader
|
86
86
|
def initialize(registry)
|
87
87
|
@registry = registry
|
88
88
|
end
|
data/lib/chupa-text/data.rb
CHANGED
@@ -117,6 +117,12 @@ module ChupaText
|
|
117
117
|
(mime_type || "").start_with?("text/")
|
118
118
|
end
|
119
119
|
|
120
|
+
# @return [Bool] true if MIME type is "text/plain", false
|
121
|
+
# otherwise.
|
122
|
+
def text_plain?
|
123
|
+
mime_type == "text/plain"
|
124
|
+
end
|
125
|
+
|
120
126
|
private
|
121
127
|
def guess_mime_type
|
122
128
|
guess_mime_type_from_uri or
|
@@ -17,6 +17,20 @@
|
|
17
17
|
module ChupaText
|
18
18
|
module Decomposers
|
19
19
|
class << self
|
20
|
+
def enable_all_gems
|
21
|
+
decomposer_specs = Gem::Specification.find_all do |spec|
|
22
|
+
spec.name.start_with?("chupa-text-decomposer-")
|
23
|
+
end
|
24
|
+
grouped_decomposer_specs = decomposer_specs.group_by(&:name)
|
25
|
+
latest_decomposer_specs = []
|
26
|
+
grouped_decomposer_specs.each do |name, specs|
|
27
|
+
latest_decomposer_specs << specs.sort_by(&:version).last
|
28
|
+
end
|
29
|
+
latest_decomposer_specs.each do |spec|
|
30
|
+
gem(spec.name, spec.version)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
20
34
|
def load
|
21
35
|
paths = []
|
22
36
|
$LOAD_PATH.each do |load_path|
|
data/lib/chupa-text/extractor.rb
CHANGED
@@ -59,6 +59,10 @@ module ChupaText
|
|
59
59
|
targets = [ensure_data(input)]
|
60
60
|
until targets.empty?
|
61
61
|
target = targets.pop
|
62
|
+
if target.text_plain?
|
63
|
+
yield(target)
|
64
|
+
next
|
65
|
+
end
|
62
66
|
decomposer = find_decomposer(target)
|
63
67
|
if decomposer.nil?
|
64
68
|
yield(target) if target.text?
|
@@ -72,13 +76,10 @@ module ChupaText
|
|
72
76
|
|
73
77
|
private
|
74
78
|
def ensure_data(input)
|
75
|
-
|
76
|
-
when String, Pathname, URI::Generic
|
77
|
-
data = Data.new
|
78
|
-
data.uri = input.to_s
|
79
|
-
data
|
80
|
-
else
|
79
|
+
if input.is_a?(Data)
|
81
80
|
input
|
81
|
+
else
|
82
|
+
InputData.new(input)
|
82
83
|
end
|
83
84
|
end
|
84
85
|
|
data/lib/chupa-text/version.rb
CHANGED
@@ -44,7 +44,7 @@ class TestCommandChupaText < Test::Unit::TestCase
|
|
44
44
|
|
45
45
|
def run_command(*arguments)
|
46
46
|
succeeded = wrap_io do
|
47
|
-
ChupaText::Command::ChupaText.run(*arguments)
|
47
|
+
ChupaText::Command::ChupaText.run("--disable-gems", *arguments)
|
48
48
|
end
|
49
49
|
[succeeded, JSON.parse(@stdout.string)]
|
50
50
|
end
|
data/test/test-data.rb
CHANGED
@@ -56,6 +56,40 @@ class TestData < Test::Unit::TestCase
|
|
56
56
|
end
|
57
57
|
end
|
58
58
|
end
|
59
|
+
|
60
|
+
sub_test_case("text?") do
|
61
|
+
def test_text_plain
|
62
|
+
@data.mime_type = "text/plain"
|
63
|
+
assert_true(@data.text?)
|
64
|
+
end
|
65
|
+
|
66
|
+
def test_text_html
|
67
|
+
@data.mime_type = "text/html"
|
68
|
+
assert_true(@data.text?)
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_application_xhtml_xml
|
72
|
+
@data.mime_type = "application/xhtml+xml"
|
73
|
+
assert_false(@data.text?)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
sub_test_case("text_plain?") do
|
78
|
+
def test_text_plain
|
79
|
+
@data.mime_type = "text/plain"
|
80
|
+
assert_true(@data.text_plain?)
|
81
|
+
end
|
82
|
+
|
83
|
+
def test_text_html
|
84
|
+
@data.mime_type = "text/html"
|
85
|
+
assert_false(@data.text_plain?)
|
86
|
+
end
|
87
|
+
|
88
|
+
def test_application_xhtml_xml
|
89
|
+
@data.mime_type = "application/xhtml+xml"
|
90
|
+
assert_false(@data.text_plain?)
|
91
|
+
end
|
92
|
+
end
|
59
93
|
end
|
60
94
|
|
61
95
|
sub_test_case("extension") do
|
data/test/test-extractor.rb
CHANGED
@@ -97,12 +97,12 @@ class TestExtractor < Test::Unit::TestCase
|
|
97
97
|
sub_test_case("multi decomposed") do
|
98
98
|
class CopyDecomposer < ChupaText::Decomposer
|
99
99
|
def target?(data)
|
100
|
-
data
|
100
|
+
data.mime_type == "text/x-plain"
|
101
101
|
end
|
102
102
|
|
103
103
|
def decompose(data)
|
104
104
|
copied_data = data.dup
|
105
|
-
copied_data
|
105
|
+
copied_data.mime_type = "text/plain"
|
106
106
|
yield(copied_data.dup)
|
107
107
|
yield(copied_data.dup)
|
108
108
|
end
|
@@ -116,7 +116,7 @@ class TestExtractor < Test::Unit::TestCase
|
|
116
116
|
|
117
117
|
def test_decompose
|
118
118
|
data = ChupaText::Data.new
|
119
|
-
data.mime_type = "text/plain"
|
119
|
+
data.mime_type = "text/x-plain"
|
120
120
|
data.body = "Hello"
|
121
121
|
assert_equal(["Hello", "Hello"], extract(data))
|
122
122
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kouhei Sutou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01-
|
11
|
+
date: 2014-01-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|