chupa-text 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +5 -0
- data/Gemfile +21 -0
- data/LICENSE.txt +502 -0
- data/README.md +91 -0
- data/Rakefile +46 -0
- data/bin/chupa-text +21 -0
- data/bin/chupa-text-generate-decomposer +21 -0
- data/chupa-text.gemspec +58 -0
- data/data/chupa-text.conf +5 -0
- data/data/mime-types.conf +19 -0
- data/doc/text/command-line.md +136 -0
- data/doc/text/decomposer.md +343 -0
- data/doc/text/library.md +72 -0
- data/doc/text/news.md +5 -0
- data/lib/chupa-text.rb +37 -0
- data/lib/chupa-text/command.rb +18 -0
- data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
- data/lib/chupa-text/command/chupa-text.rb +102 -0
- data/lib/chupa-text/configuration-loader.rb +95 -0
- data/lib/chupa-text/configuration.rb +49 -0
- data/lib/chupa-text/data.rb +149 -0
- data/lib/chupa-text/decomposer-registry.rb +37 -0
- data/lib/chupa-text/decomposer.rb +37 -0
- data/lib/chupa-text/decomposers.rb +59 -0
- data/lib/chupa-text/decomposers/csv.rb +44 -0
- data/lib/chupa-text/decomposers/gzip.rb +51 -0
- data/lib/chupa-text/decomposers/tar.rb +42 -0
- data/lib/chupa-text/decomposers/xml.rb +55 -0
- data/lib/chupa-text/extractor.rb +91 -0
- data/lib/chupa-text/file-content.rb +35 -0
- data/lib/chupa-text/formatters.rb +17 -0
- data/lib/chupa-text/formatters/json.rb +60 -0
- data/lib/chupa-text/input-data.rb +58 -0
- data/lib/chupa-text/mime-type-registry.rb +41 -0
- data/lib/chupa-text/mime-type.rb +36 -0
- data/lib/chupa-text/text-data.rb +26 -0
- data/lib/chupa-text/version.rb +19 -0
- data/lib/chupa-text/virtual-content.rb +91 -0
- data/lib/chupa-text/virtual-file-data.rb +46 -0
- data/test/command/test-chupa-text.rb +178 -0
- data/test/decomposers/test-csv.rb +48 -0
- data/test/decomposers/test-gzip.rb +113 -0
- data/test/decomposers/test-tar.rb +78 -0
- data/test/decomposers/test-xml.rb +58 -0
- data/test/fixture/command/chupa-text/hello.txt +1 -0
- data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
- data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
- data/test/fixture/extractor/hello.txt +1 -0
- data/test/fixture/gzip/hello.tar.gz +0 -0
- data/test/fixture/gzip/hello.tgz +0 -0
- data/test/fixture/gzip/hello.txt.gz +0 -0
- data/test/fixture/tar/directory.tar +0 -0
- data/test/fixture/tar/top-level.tar +0 -0
- data/test/helper.rb +25 -0
- data/test/run-test.rb +35 -0
- data/test/test-configuration-loader.rb +54 -0
- data/test/test-data.rb +85 -0
- data/test/test-decomposer-registry.rb +30 -0
- data/test/test-decomposer.rb +41 -0
- data/test/test-decomposers.rb +59 -0
- data/test/test-extractor.rb +125 -0
- data/test/test-file-content.rb +51 -0
- data/test/test-mime-type-registry.rb +48 -0
- data/test/test-text-data.rb +36 -0
- data/test/test-virtual-content.rb +103 -0
- metadata +183 -0
@@ -0,0 +1,102 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "optparse"
|
18
|
+
|
19
|
+
module ChupaText
  module Command
    # The implementation of the `chupa-text` command line.
    #
    # It parses the command line arguments, builds the data to process
    # from an optional `FILE_OR_URI` argument (or the standard input when
    # the argument is omitted), runs the extractor and writes what the
    # extractor yields to the standard output with the JSON formatter.
    class ChupaText
      class << self
        # Runs the command with a newly created instance.
        #
        # @param [Array<String>] arguments The command line arguments.
        # @return [Boolean] `true` when the command is finished,
        #   `false` when the arguments can't be accepted.
        def run(*arguments)
          chupa_text = new
          chupa_text.run(*arguments)
        end
      end

      def initialize
        @input = nil
        # NOTE: This is the shared default configuration object, so
        # `--configuration` updates it in place.
        @configuration = Configuration.default
      end

      # @param [Array<String>] arguments The command line arguments.
      # @return [Boolean] `true` when the command is finished,
      #   `false` when the arguments can't be accepted.
      def run(*arguments)
        return false unless parse_arguments(arguments)

        Decomposers.load
        extractor = create_extractor
        data = create_data
        formatter = create_formatter
        formatter.format_start(data)
        extractor.extract(data) do |extracted|
          formatter.format_extracted(extracted)
        end
        formatter.format_finish(data)
        true
      end

      private
      # Loads the configuration file at `path` into the configuration
      # used by this command.
      def load_configuration(path)
        loader = ConfigurationLoader.new(@configuration)
        loader.load(path)
      end

      # Parses `arguments` and keeps the optional input in `@input`.
      #
      # Problems are reported to the standard output: the parse error
      # message for an invalid option, the help message when two or
      # more inputs are given.
      #
      # @return [Boolean] `true` if the arguments are accepted,
      #   `false` otherwise.
      def parse_arguments(arguments)
        parser = create_option_parser
        rest = nil
        begin
          rest = parser.parse!(arguments)
        rescue OptionParser::ParseError => error
          puts(error.message)
          return false
        end
        if rest.size > 1
          puts(parser.help)
          return false
        end
        # At most one element remains here; `@input` stays `nil`
        # when no input is given.
        @input, = rest
        true
      end

      def create_option_parser
        parser = OptionParser.new
        parser.banner += " [FILE_OR_URI]"
        parser.version = VERSION
        parser.on("--configuration=FILE",
                  "Read configuration from FILE.") do |path|
          load_configuration(path)
        end
        parser
      end

      def create_extractor
        extractor = Extractor.new
        extractor.apply_configuration(@configuration)
        extractor
      end

      # Builds the data to process: the standard input when no input
      # is given on the command line, the given input otherwise.
      def create_data
        if @input.nil?
          VirtualFileData.new(nil, $stdin)
        else
          InputData.new(@input)
        end
      end

      def create_formatter
        Formatters::JSON.new($stdout)
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "pathname"
|
18
|
+
|
19
|
+
module ChupaText
  # Loads a configuration file into a configuration object.
  #
  # A configuration file is Ruby code that is evaluated in the context
  # of the loader, so it can use {#decomposer} and {#mime_type}:
  #
  #     decomposer.names = ["csv"]
  #     mime_type["txt"] = "text/plain"
  class ConfigurationLoader
    # @return [DecomposerLoader] The entry point to configure decomposers.
    attr_reader :decomposer

    # @return [MIMETypeLoader] The entry point to register MIME types.
    attr_reader :mime_type

    # @param configuration The configuration to be updated. It must
    #   respond to `decomposer` and `mime_type_registry`.
    def initialize(configuration)
      @configuration = configuration
      @decomposer = DecomposerLoader.new(@configuration.decomposer)
      @mime_type = MIMETypeLoader.new(@configuration.mime_type_registry)
      @load_paths = []
      data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
      @load_paths << File.expand_path(data_dir)
    end

    # Evaluates the configuration file at `path`.
    #
    # SECURITY: The file content is evaluated as Ruby code with
    # `instance_eval`, so it can run arbitrary code. Load only
    # trusted files.
    #
    # @param [String, Pathname] path The path of the configuration
    #   file. A relative path that doesn't exist is searched in the
    #   load paths.
    # @return The value of the last expression in the file.
    def load(path)
      path = resolve_path(path)
      # The file name for `instance_eval` is converted to String so
      # that a Pathname can also be given.
      instance_eval(File.read(path), path.to_s, 1)
    end

    private
    # @return The path itself if it exists or is absolute, the first
    #   existing path under the load paths otherwise. The given path
    #   is returned as is when nothing is found.
    def resolve_path(path)
      return path if File.exist?(path)
      return path if Pathname(path).absolute?
      @load_paths.each do |load_path|
        resolved_path = File.join(load_path, path)
        return resolved_path if File.exist?(resolved_path)
      end
      path
    end

    # Gives a configuration file access to the decomposer names and
    # the per decomposer options.
    class DecomposerLoader
      def initialize(configuration)
        @configuration = configuration
      end

      def names
        @configuration.names
      end

      def names=(names)
        @configuration.names = names
      end

      # @param [String] name The key of the options.
      # @return The options stored for `name`.
      def [](name)
        @configuration.options[name]
      end

      def []=(name, options)
        @configuration.options[name] = options
      end

      # Maps `loader.NAME = value` to `loader["NAME"] = value` and
      # `loader.NAME` to `loader["NAME"]`. Any other call shape (a
      # block or other argument counts) is passed to `super`.
      def method_missing(name, *arguments)
        return super if block_given?

        method_name = name.to_s
        if method_name.end_with?("=") && arguments.size == 1
          self[method_name.chomp("=")] = arguments.first
        elsif arguments.empty?
          self[method_name]
        else
          super
        end
      end
    end

    # Gives a configuration file a way to register MIME types with
    # `mime_type[extension] = type`.
    class MIMETypeLoader
      def initialize(registry)
        @registry = registry
      end

      def []=(extension, mime_type)
        @registry.register(extension, mime_type)
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
  # Holds the decomposer configuration and the MIME type registry.
  class Configuration
    class << self
      # @return [Configuration] The default configuration. It is
      #   created on the first call by loading `chupa-text.conf` and
      #   the same object is returned by later calls.
      def default
        @default ||= create_default
      end

      private
      def create_default
        configuration = new
        loader = ConfigurationLoader.new(configuration)
        loader.load("chupa-text.conf")
        configuration
      end
    end

    # @return [DecomposerConfiguration] The configuration for decomposers.
    attr_reader :decomposer

    # The registry used to register MIME types. It is initialized with
    # `MIMEType.registry`.
    attr_accessor :mime_type_registry

    def initialize
      @decomposer = DecomposerConfiguration.new
      @mime_type_registry = MIMEType.registry
    end

    # Holds decomposer names and the options stored for each name.
    class DecomposerConfiguration
      # @return [Array<String>] The decomposer names. The initial
      #   value is `["*"]`.
      attr_accessor :names

      # @return [Hash] The options. The initial value is an empty hash.
      attr_accessor :options

      def initialize
        @names = ["*"]
        @options = {}
      end
    end
  end
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
require "open-uri"
require "pathname"
require "stringio"
require "uri"
|
19
|
+
|
20
|
+
module ChupaText
  # A unit of content processed by ChupaText: an optional URI, an
  # optional body and attributes describing them.
  class Data
    # @return [URI, nil] The URI of the data if the data is for remote
    #   or local file, `nil` if the data isn't associated with any
    #   URIs.
    attr_reader :uri

    # @return [String, nil] The content of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :body

    # @return [Integer, nil] The byte size of the data, `nil` if the data
    #   doesn't have any content.
    attr_accessor :size

    # @return [String, nil] The path associated with the content of
    #   the data, `nil` if the data isn't associated with any file.
    #
    # The path may not be related with the original content. For
    # example, `"/tmp/XXX.txt"` may be returned for the data of
    # `"http://example.com/XXX.txt"`.
    #
    # This value is useful to use an external command to extract
    # text and meta-data.
    attr_accessor :path

    # @return [Hash] The attributes of the data. They are also
    #   available by {#[]} and {#[]=}.
    attr_accessor :attributes

    # @return [Data, nil] The source of the data. For example, text
    #   data (`hello.txt`) in archive data (`hello.tar`) have the
    #   archive data in {#source}.
    attr_accessor :source

    def initialize
      @uri = nil
      @body = nil
      @size = nil
      @path = nil
      @mime_type = nil
      @attributes = {}
      @source = nil
    end

    # Copies the attributes hash so that a duplicated data doesn't
    # share it with the original data.
    def initialize_copy(object)
      super
      @attributes = @attributes.dup
      self
    end

    # @param [String, Pathname, URI, nil] uri The URI for the data.
    #   String and Pathname are parsed by `URI.parse`. If `uri` is
    #   `nil`, it means that the data isn't associated with any URIs.
    def uri=(uri)
      case uri
      when String, Pathname
        uri = URI.parse(uri.to_s)
      end
      @uri = uri
    end

    # @yield [StringIO] An IO that reads {#body}.
    def open
      yield(StringIO.new(body))
    end

    # @return The value of the attribute `name`.
    def [](name)
      @attributes[name]
    end

    # Sets the value of the attribute `name`.
    def []=(name, value)
      @attributes[name] = value
    end

    # @return [String] The MIME type of the data. If MIME type
    #   isn't set, guesses MIME type from path and body.
    # @return [nil] If MIME type isn't set and it can't guess MIME type
    #   from path and body.
    def mime_type
      @mime_type || guess_mime_type
    end

    # @param [String, nil] type The MIME type of the data. You can
    #   unset MIME type by `nil`. If you unset MIME type, MIME type
    #   is guessed from path and body of the data.
    def mime_type=(type)
      @mime_type = type
    end

    # @return [String, nil] Normalized extension as String if {#uri}
    #   is not `nil`, `nil` otherwise. The normalized extension uses
    #   lower case like `pdf` not `PDF`. It is an empty string if the
    #   path of {#uri} has no extension.
    def extension
      return nil if @uri.nil?
      # `to_s` guards against a URI whose path is `nil`, which
      # `File.extname` can't accept.
      File.extname(@uri.path.to_s).downcase.sub(/\A\./, "")
    end

    # @return [Bool] true if MIME type is "text/XXX", false
    #   otherwise.
    def text?
      (mime_type || "").start_with?("text/")
    end

    private
    def guess_mime_type
      guess_mime_type_from_uri || guess_mime_type_from_body
    end

    def guess_mime_type_from_uri
      MIMEType.registry.find(extension)
    end

    # @return [String, nil] `"text/plain"` if the bytes of the body
    #   are valid as UTF-8, `nil` otherwise or if there is no body.
    def guess_mime_type_from_body
      mime_type = nil
      change_encoding(body, "UTF-8") do |utf8_body|
        mime_type = "text/plain" if utf8_body.valid_encoding?
      end
      mime_type
    end

    # Yields `string` tagged as `encoding`. The bytes aren't converted.
    #
    # A copy is tagged instead of `string` itself: the given string is
    # never modified, so a frozen string is also accepted.
    #
    # @return The value of the block, `nil` if `string` is `nil`.
    def change_encoding(string, encoding)
      return if string.nil?
      yield(string.dup.force_encoding(encoding))
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
  # A name to decomposer class table.
  #
  # It is Enumerable; each element is a pair of name and decomposer
  # class.
  class DecomposerRegistry
    include Enumerable

    def initialize
      @decomposer_classes = {}
    end

    # Associates `decomposer_class` with `name`. An existing
    # association for the same name is replaced.
    def register(name, decomposer_class)
      @decomposer_classes[name] = decomposer_class
    end

    # @return The decomposer class registered as `name`, `nil` if
    #   nothing is registered as `name`.
    def find(name)
      @decomposer_classes[name]
    end

    # @yield [name, decomposer_class] Each registered pair.
    def each(&block)
      @decomposer_classes.each(&block)
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This library is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU Lesser General Public
|
5
|
+
# License as published by the Free Software Foundation; either
|
6
|
+
# version 2.1 of the License, or (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This library is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# Lesser General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Lesser General Public
|
14
|
+
# License along with this library; if not, write to the Free Software
|
15
|
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
module ChupaText
  # The base class of decomposers. A subclass must implement
  # {#target?} and {#decompose}.
  class Decomposer
    class << self
      # @return [DecomposerRegistry] The registry of decomposers. It
      #   is created on the first call.
      def registry
        # A class variable is used on purpose: it is shared by this
        # class and all of its subclasses, so every subclass gets the
        # same registry.
        @@registry ||= DecomposerRegistry.new
      end
    end

    # @param options The options for the decomposer. They are kept in
    #   `@options` for subclasses.
    def initialize(options)
      @options = options
    end

    # @raise [NotImplementedError] Always. A subclass must override it.
    def target?(data)
      raise NotImplementedError, "must implement #{self.class}\##{__method__}"
    end

    # @raise [NotImplementedError] Always. A subclass must override it.
    def decompose(data)
      raise NotImplementedError, "must implement #{self.class}\##{__method__}"
    end
  end
end
|