chupa-text 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.yardopts +5 -0
  3. data/Gemfile +21 -0
  4. data/LICENSE.txt +502 -0
  5. data/README.md +91 -0
  6. data/Rakefile +46 -0
  7. data/bin/chupa-text +21 -0
  8. data/bin/chupa-text-generate-decomposer +21 -0
  9. data/chupa-text.gemspec +58 -0
  10. data/data/chupa-text.conf +5 -0
  11. data/data/mime-types.conf +19 -0
  12. data/doc/text/command-line.md +136 -0
  13. data/doc/text/decomposer.md +343 -0
  14. data/doc/text/library.md +72 -0
  15. data/doc/text/news.md +5 -0
  16. data/lib/chupa-text.rb +37 -0
  17. data/lib/chupa-text/command.rb +18 -0
  18. data/lib/chupa-text/command/chupa-text-generate-decomposer.rb +324 -0
  19. data/lib/chupa-text/command/chupa-text.rb +102 -0
  20. data/lib/chupa-text/configuration-loader.rb +95 -0
  21. data/lib/chupa-text/configuration.rb +49 -0
  22. data/lib/chupa-text/data.rb +149 -0
  23. data/lib/chupa-text/decomposer-registry.rb +37 -0
  24. data/lib/chupa-text/decomposer.rb +37 -0
  25. data/lib/chupa-text/decomposers.rb +59 -0
  26. data/lib/chupa-text/decomposers/csv.rb +44 -0
  27. data/lib/chupa-text/decomposers/gzip.rb +51 -0
  28. data/lib/chupa-text/decomposers/tar.rb +42 -0
  29. data/lib/chupa-text/decomposers/xml.rb +55 -0
  30. data/lib/chupa-text/extractor.rb +91 -0
  31. data/lib/chupa-text/file-content.rb +35 -0
  32. data/lib/chupa-text/formatters.rb +17 -0
  33. data/lib/chupa-text/formatters/json.rb +60 -0
  34. data/lib/chupa-text/input-data.rb +58 -0
  35. data/lib/chupa-text/mime-type-registry.rb +41 -0
  36. data/lib/chupa-text/mime-type.rb +36 -0
  37. data/lib/chupa-text/text-data.rb +26 -0
  38. data/lib/chupa-text/version.rb +19 -0
  39. data/lib/chupa-text/virtual-content.rb +91 -0
  40. data/lib/chupa-text/virtual-file-data.rb +46 -0
  41. data/test/command/test-chupa-text.rb +178 -0
  42. data/test/decomposers/test-csv.rb +48 -0
  43. data/test/decomposers/test-gzip.rb +113 -0
  44. data/test/decomposers/test-tar.rb +78 -0
  45. data/test/decomposers/test-xml.rb +58 -0
  46. data/test/fixture/command/chupa-text/hello.txt +1 -0
  47. data/test/fixture/command/chupa-text/hello.txt.gz +0 -0
  48. data/test/fixture/command/chupa-text/no-decomposer.conf +3 -0
  49. data/test/fixture/extractor/hello.txt +1 -0
  50. data/test/fixture/gzip/hello.tar.gz +0 -0
  51. data/test/fixture/gzip/hello.tgz +0 -0
  52. data/test/fixture/gzip/hello.txt.gz +0 -0
  53. data/test/fixture/tar/directory.tar +0 -0
  54. data/test/fixture/tar/top-level.tar +0 -0
  55. data/test/helper.rb +25 -0
  56. data/test/run-test.rb +35 -0
  57. data/test/test-configuration-loader.rb +54 -0
  58. data/test/test-data.rb +85 -0
  59. data/test/test-decomposer-registry.rb +30 -0
  60. data/test/test-decomposer.rb +41 -0
  61. data/test/test-decomposers.rb +59 -0
  62. data/test/test-extractor.rb +125 -0
  63. data/test/test-file-content.rb +51 -0
  64. data/test/test-mime-type-registry.rb +48 -0
  65. data/test/test-text-data.rb +36 -0
  66. data/test/test-virtual-content.rb +103 -0
  67. metadata +183 -0
@@ -0,0 +1,30 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Tests registration on ChupaText::DecomposerRegistry.
class TestDecomposerRegistry < Test::Unit::TestCase
  # Empty decomposer subclass; used only as a value to register.
  class CSVDecomposer < ChupaText::Decomposer
  end

  def setup
    @registry = ChupaText::DecomposerRegistry.new
  end

  # A fresh registry enumerates as empty; after a single #register call
  # it enumerates exactly the registered [name, class] pair.
  def test_register
    entries_before = @registry.to_a
    assert_equal([], entries_before)

    @registry.register("csv", CSVDecomposer)
    entries_after = @registry.to_a
    assert_equal([["csv", CSVDecomposer]], entries_after)
  end
end
@@ -0,0 +1,41 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ class TestDecomposer < Test::Unit::TestCase
18
+ sub_test_case("not-implemented") do
19
+ class NotImplementedDecomposer < ChupaText::Decomposer
20
+ end
21
+
22
+ def setup
23
+ @decomposer = NotImplementedDecomposer.new({})
24
+ @data = ChupaText::Data.new
25
+ end
26
+
27
+ def test_target?
28
+ message = "must implement #{NotImplementedDecomposer}\#target?"
29
+ assert_raise(NotImplementedError.new(message)) do
30
+ @decomposer.target?(@data)
31
+ end
32
+ end
33
+
34
+ def test_decompose
35
+ message = "must implement #{NotImplementedDecomposer}\#decompose"
36
+ assert_raise(NotImplementedError.new(message)) do
37
+ @decomposer.decompose(@data)
38
+ end
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,59 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Tests ChupaText::Decomposers.create against a registry that holds a
# single decomposer registered under the name "csv".
class TestDecomposers < Test::Unit::TestCase
  # Empty decomposer subclass; only its class identity is checked.
  class CSVDecomposer < ChupaText::Decomposer
  end

  def setup
    @registry = ChupaText::DecomposerRegistry.new
    @registry.register("csv", CSVDecomposer)
    @configuration = ChupaText::Configuration.new
  end

  sub_test_case("create") do
    # With an untouched configuration the registered decomposer is created.
    def test_default
      assert_equal([CSVDecomposer], created_classes)
    end

    # An empty name list creates nothing.
    def test_no_match
      @configuration.decomposer.names = []
      assert_equal([], created_classes)
    end

    # A "*" wildcard pattern matches the registered name.
    def test_glob
      @configuration.decomposer.names = ["*sv"]
      assert_equal([CSVDecomposer], created_classes)
    end

    # A brace pattern matches the registered name; skipped when this Ruby
    # does not define File::FNM_EXTGLOB.
    def test_ext_glob
      omit("File::FNM_EXTGLOB is required") unless File.const_defined?(:FNM_EXTGLOB)

      @configuration.decomposer.names = ["{a,b,c}sv"]
      assert_equal([CSVDecomposer], created_classes)
    end

    private
    def create
      ChupaText::Decomposers.create(@registry, @configuration.decomposer)
    end

    # Classes of the decomposers created from the current configuration.
    def created_classes
      create.map(&:class)
    end
  end
end
@@ -0,0 +1,125 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Tests ChupaText::Extractor#extract with and without decomposers.
class TestExtractor < Test::Unit::TestCase
  include Helper

  def setup
    @extractor = ChupaText::Extractor.new
  end

  private
  # Delegates to the inherited fixture_path (presumably Helper's -- confirm)
  # with "extractor" prepended to the given components.
  def fixture_path(*components)
    super("extractor", *components)
  end

  sub_test_case("extract") do
    private
    # Runs the extractor on +data+ and returns the bodies of every yielded
    # data object, in yield order.
    def extract(data)
      bodies = []
      @extractor.extract(data) { |extracted_data| bodies << extracted_data.body }
      bodies
    end

    # Builds a ChupaText::Data with the given MIME type and body.
    def build_data(mime_type, body)
      data = ChupaText::Data.new
      data.mime_type = mime_type
      data.body = body
      data
    end

    # Smoke tests: these make no assertions, so they only fail when
    # extraction raises for the given kind of input.
    sub_test_case("input") do
      def test_string
        path = fixture_path("hello.txt")
        extract(path.to_s)
      end

      def test_uri
        path = fixture_path("hello.txt")
        extract(URI.parse(path.to_s))
      end

      def test_path
        extract(fixture_path("hello.txt"))
      end
    end

    sub_test_case("no decomposers") do
      # text/plain data is yielded as is.
      def test_text
        data = build_data("text/plain", "Hello")
        assert_equal(["Hello"], extract(data))
      end

      # Non-text data yields nothing when no decomposer is added.
      def test_not_text
        data = build_data("application/x-javascript", "alert('Hello');")
        assert_equal([], extract(data))
      end
    end

    sub_test_case("use decomposer") do
      # Turns text/html data into text/plain by deleting every <...> span.
      class HTMLDecomposer < ChupaText::Decomposer
        def target?(data)
          data.mime_type == "text/html"
        end

        def decompose(data)
          plain = ChupaText::Data.new
          plain.mime_type = "text/plain"
          plain.body = data.body.gsub(/<.+?>/, "")
          yield(plain)
        end
      end

      def setup
        super
        @extractor.add_decomposer(HTMLDecomposer.new({}))
      end

      def test_decompose
        data = build_data("text/html", "<html><body>Hello</body></html>")
        assert_equal(["Hello"], extract(data))
      end
    end

    sub_test_case("multi decomposed") do
      # Yields two copies of any data not yet marked "copied"; the mark
      # keeps the copies from being targeted again.
      class CopyDecomposer < ChupaText::Decomposer
        def target?(data)
          data["copied"].nil?
        end

        def decompose(data)
          marked = data.dup
          marked["copied"] = true
          yield(marked.dup)
          yield(marked.dup)
        end
      end

      def setup
        super
        @extractor.add_decomposer(CopyDecomposer.new({}))
      end

      def test_decompose
        data = build_data("text/plain", "Hello")
        assert_equal(["Hello", "Hello"], extract(data))
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Tests ChupaText::FileContent against a temporary file on disk.
class TestFileContent < Test::Unit::TestCase
  def setup
    @file = Tempfile.new(["test-file-content", ".txt"])
  end

  # Close and delete the temporary file created in #setup. Without this,
  # every test leaves an open descriptor and a file on disk until the
  # Tempfile object happens to be finalized.
  def teardown
    @file.close! if @file
  end

  # #size reports the number of bytes written to the file.
  def test_size
    body = "Hello"
    assert_equal(body.bytesize, content(body).size)
  end

  # #path is the path the content was created with.
  def test_path
    assert_equal(@file.path, content.path)
  end

  # #body returns what was written to the file.
  def test_body
    body = "Hello"
    assert_equal(body, content(body).body)
  end

  # #open yields an object whose #read returns the file's content.
  def test_open
    body = "Hello"
    assert_equal(body, content(body).open {|file| file.read})
  end

  private
  # Writes +string+ and flushes so it is visible when the path is read.
  def write(string)
    @file.write(string)
    @file.flush
  end

  # Builds a FileContent for the temporary file, writing +string+ to the
  # file first when one is given.
  def content(string=nil)
    write(string) if string
    ChupaText::FileContent.new(@file.path)
  end
end
@@ -0,0 +1,48 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Tests registration and lookup on ChupaText::MIMETypeRegistry.
class TestMIMETypeRegistry < Test::Unit::TestCase
  def setup
    @registry = ChupaText::MIMETypeRegistry.new
  end

  sub_test_case("register") do
    # The first registration is still found after a second, different
    # extension has been registered.
    def test_multiple
      [
        ["csv", "text/csv"],
        ["txt", "text/plain"],
      ].each do |extension, mime_type|
        @registry.register(extension, mime_type)
      end

      found = @registry.find("csv")
      assert_equal("text/csv", found)
    end
  end

  sub_test_case("find") do
    def setup
      super
      @registry.register("csv", "text/csv")
    end

    # A nil extension finds nothing.
    def test_nil
      found = @registry.find(nil)
      assert_nil(found)
    end

    # An unregistered extension finds nothing.
    def test_nonexistent
      found = @registry.find("txt")
      assert_nil(found)
    end

    # A registered extension finds its MIME type.
    def test_existent
      found = @registry.find("csv")
      assert_equal("text/csv", found)
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Tests the attributes of ChupaText::TextData built from a string.
class TestTextData < Test::Unit::TestCase
  # The MIME type is "text/plain", checked here with empty text.
  def test_mime_type
    data = text_data("")
    assert_equal("text/plain", data.mime_type)
  end

  # #body returns the text given at construction.
  def test_body
    text = "Hello"
    assert_equal(text, text_data(text).body)
  end

  # #size is the byte size of the text.
  def test_size
    text = "Hello"
    expected_size = text.bytesize
    assert_equal(expected_size, text_data(text).size)
  end

  private
  def text_data(text)
    ChupaText::TextData.new(text)
  end
end
@@ -0,0 +1,103 @@
1
+ # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
# Tests ChupaText::VirtualContent built from an in-memory IO.
class TestVirtualContent < Test::Unit::TestCase
  private
  # Wraps +string+ in a StringIO to use as the content's input.
  def input(string)
    StringIO.new(string)
  end

  # Builds a VirtualContent that reads +string+; +original_path+ is passed
  # through unchanged.
  def content(string, original_path=nil)
    source = input(string)
    ChupaText::VirtualContent.new(source, original_path)
  end

  sub_test_case("small data") do
    def setup
      @body = "Hello"
    end

    def test_size
      virtual = content
      assert_equal(@body.bytesize, virtual.size)
    end

    # The file at #path holds the whole body.
    def test_path
      virtual = content
      assert_equal(@body, File.read(virtual.path))
    end

    def test_body
      virtual = content
      assert_equal(@body, virtual.body)
    end

    def test_open
      read_back = content.open { |io| io.read }
      assert_equal(@body, read_back)
    end

    private
    # Content for this case's @body.
    def content
      super(@body)
    end
  end

  sub_test_case("large data") do
    # One byte longer than VirtualContent::BUFFER_SIZE.
    def setup
      @body = "X" * (ChupaText::VirtualContent::BUFFER_SIZE + 1)
    end

    def test_size
      virtual = content
      assert_equal(@body.bytesize, virtual.size)
    end

    # The file at #path holds the whole body.
    def test_path
      virtual = content
      assert_equal(@body, File.read(virtual.path))
    end

    def test_body
      virtual = content
      assert_equal(@body, virtual.body)
    end

    def test_open
      read_back = content.open { |io| io.read }
      assert_equal(@body, read_back)
    end

    private
    # Content for this case's @body.
    def content
      super(@body)
    end
  end

  sub_test_case("original path") do
    # The extension of the original path carries over to #path.
    def test_extension
      extension = File.extname(path("hello.txt"))
      assert_equal(".txt", extension)
    end

    def test_extension_only
      extension = File.extname(path(".txt"))
      assert_equal(".txt", extension)
    end

    def test_no_extension
      extension = File.extname(path("hello"))
      assert_equal("", extension)
    end

    def test_nil
      extension = File.extname(path(nil))
      assert_equal("", extension)
    end

    private
    # #path of an empty content created with +original_path+.
    def path(original_path)
      virtual = content("", original_path)
      virtual.path
    end
  end
end