yomu 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +10 -10
- data/lib/yomu.rb +21 -8
- data/lib/yomu/version.rb +1 -1
- data/test/samples/sample filename with spaces.pages +0 -0
- data/test/yomu_test.rb +43 -5
- metadata +5 -2
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# Yomu 読む
|
2
|
-
[Yomu](http://github.com/
|
2
|
+
[Yomu](http://erol.github.com/yomu) is a library for extracting text and metadata from files and documents using the [Apache TIKA](http://tika.apache.org/) content analysis toolkit.
|
3
3
|
|
4
4
|
Here are some of the formats supported:
|
5
5
|
|
@@ -10,7 +10,7 @@ Here are some of the formats supported:
|
|
10
10
|
- Rich Text Format (.rtf)
|
11
11
|
- Portable Document Format (.pdf)
|
12
12
|
|
13
|
-
For the complete list of supported formats, please visit the Apache
|
13
|
+
For the complete list of supported formats, please visit the Apache TIKA
|
14
14
|
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
15
15
|
|
16
16
|
## Installation and Dependencies
|
@@ -27,7 +27,7 @@ Or install it yourself as:
|
|
27
27
|
|
28
28
|
$ gem install yomu
|
29
29
|
|
30
|
-
Yomu packages the Apache
|
30
|
+
**Yomu packages the Apache TIKA application jar and requires a working JRE for it to work.**
|
31
31
|
|
32
32
|
## Usage
|
33
33
|
|
@@ -40,21 +40,21 @@ You can extract text by calling `Yomu.read` directly:
|
|
40
40
|
data = File.read 'sample.pages'
|
41
41
|
text = Yomu.read :text, data
|
42
42
|
|
43
|
-
|
43
|
+
### Reading text from a given filename
|
44
44
|
|
45
45
|
You can also make a new instance of Yomu and pass a filename.
|
46
46
|
|
47
47
|
yomu = Yomu.new 'sample.pages'
|
48
48
|
text = yomu.text
|
49
49
|
|
50
|
-
|
50
|
+
### Reading text from a given URL
|
51
51
|
|
52
52
|
This is useful for reading remote files, like documents hosted on Amazon S3.
|
53
53
|
|
54
54
|
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
55
55
|
text = yomu.text
|
56
56
|
|
57
|
-
|
57
|
+
### Reading text from a stream
|
58
58
|
|
59
59
|
Yomu can also read from a stream or any object that responds to `read`, including Ruby on Rails' and Sinatra's file uploads:
|
60
60
|
|
@@ -66,8 +66,8 @@ Yomu can also read from a stream or any object that responds to `read`, includin
|
|
66
66
|
## Contributing
|
67
67
|
|
68
68
|
1. Fork it
|
69
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
70
|
-
3. Create tests and make them pass (`rake test`)
|
71
|
-
4. Commit your changes (`git commit -am 'Added some feature'`)
|
72
|
-
5. Push to the branch (`git push origin my-new-feature`)
|
69
|
+
2. Create your feature branch ( `git checkout -b my-new-feature` )
|
70
|
+
3. Create tests and make them pass ( `rake test` )
|
71
|
+
4. Commit your changes ( `git commit -am 'Added some feature'` )
|
72
|
+
5. Push to the branch ( `git push origin my-new-feature` )
|
73
73
|
6. Create a new Pull Request
|
data/lib/yomu.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'yomu/version'
|
2
2
|
|
3
3
|
require 'net/http'
|
4
4
|
require 'yaml'
|
@@ -46,11 +46,12 @@ class Yomu
|
|
46
46
|
|
47
47
|
def initialize(input)
|
48
48
|
if input.is_a? String
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
else
|
49
|
+
if input =~ URI::regexp
|
50
|
+
@uri = URI.parse input
|
51
|
+
elsif File.exists? input
|
53
52
|
@path = input
|
53
|
+
else
|
54
|
+
raise Errno::ENOENT.new "missing file or invalid URI - #{input}"
|
54
55
|
end
|
55
56
|
elsif input.respond_to? :read
|
56
57
|
@stream = input
|
@@ -81,16 +82,28 @@ class Yomu
|
|
81
82
|
@metadata = Yomu.read :metadata, data
|
82
83
|
end
|
83
84
|
|
85
|
+
def path?
|
86
|
+
defined? @path
|
87
|
+
end
|
88
|
+
|
89
|
+
def uri?
|
90
|
+
defined? @uri
|
91
|
+
end
|
92
|
+
|
93
|
+
def stream?
|
94
|
+
defined? @stream
|
95
|
+
end
|
96
|
+
|
84
97
|
protected
|
85
98
|
|
86
99
|
def data
|
87
100
|
return @data if defined? @data
|
88
101
|
|
89
|
-
if
|
102
|
+
if path?
|
90
103
|
@data = File.read @path
|
91
|
-
elsif
|
104
|
+
elsif uri?
|
92
105
|
@data = Net::HTTP.get @uri
|
93
|
-
elsif
|
106
|
+
elsif stream?
|
94
107
|
@data = @stream.read
|
95
108
|
end
|
96
109
|
|
data/lib/yomu/version.rb
CHANGED
Binary file
|
data/test/yomu_test.rb
CHANGED
@@ -23,26 +23,64 @@ class YomuTest < MiniTest::Unit::TestCase
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def test_yomu_can_be_initialized_with_a_root_path
|
27
27
|
assert_silent do
|
28
|
-
Yomu.new '
|
28
|
+
yomu = Yomu.new File.join(File.dirname(__FILE__), 'samples/sample.pages')
|
29
|
+
|
30
|
+
assert_block { yomu.path? }
|
31
|
+
assert_block { !yomu.uri? }
|
32
|
+
assert_block { !yomu.stream? }
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_yomu_can_be_initialized_with_a_relative_path
|
37
|
+
assert_silent do
|
38
|
+
yomu = Yomu.new 'test/samples/sample.pages'
|
39
|
+
|
40
|
+
assert_block { yomu.path? }
|
41
|
+
assert_block { !yomu.uri? }
|
42
|
+
assert_block { !yomu.stream? }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def test_yomu_can_be_initialized_with_a_path_with_spaces
|
47
|
+
assert_silent do
|
48
|
+
yomu = Yomu.new 'test/samples/sample filename with spaces.pages'
|
49
|
+
|
50
|
+
assert_block { yomu.path? }
|
51
|
+
assert_block { !yomu.uri? }
|
52
|
+
assert_block { !yomu.stream? }
|
29
53
|
end
|
30
54
|
end
|
31
55
|
|
32
|
-
def
|
56
|
+
def test_yomu_can_be_initialized_with_a_uri
|
33
57
|
assert_silent do
|
34
|
-
Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
58
|
+
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
59
|
+
|
60
|
+
assert_block { yomu.uri? }
|
61
|
+
assert_block { !yomu.path? }
|
62
|
+
assert_block { !yomu.stream? }
|
35
63
|
end
|
36
64
|
end
|
37
65
|
|
38
66
|
def test_yomu_can_be_initialized_with_a_stream_or_object_that_can_be_read
|
39
67
|
assert_silent do
|
40
68
|
File.open 'test/samples/sample.pages', 'r' do |file|
|
41
|
-
Yomu.new file
|
69
|
+
yomu = Yomu.new file
|
70
|
+
|
71
|
+
assert_block { yomu.stream? }
|
72
|
+
assert_block { !yomu.path? }
|
73
|
+
assert_block { !yomu.uri? }
|
42
74
|
end
|
43
75
|
end
|
44
76
|
end
|
45
77
|
|
78
|
+
def test_yomu_cannot_be_initialized_with_a_path_to_a_missing_file
|
79
|
+
assert_raises Errno::ENOENT do
|
80
|
+
Yomu.new 'test/sample/missing.pages'
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
46
84
|
def test_yomu_cannot_be_initialized_with_other_objects
|
47
85
|
[nil, 1, 1.1].each do |object|
|
48
86
|
assert_raises TypeError do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yomu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Yomu is a library for extracting text and metadata using the Apache TIKA
|
15
15
|
content analysis toolkit.
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- jar/tika-app-1.1.jar
|
29
29
|
- lib/yomu.rb
|
30
30
|
- lib/yomu/version.rb
|
31
|
+
- test/samples/sample filename with spaces.pages
|
31
32
|
- test/samples/sample.pages
|
32
33
|
- test/test_helper.rb
|
33
34
|
- test/yomu_test.rb
|
@@ -58,6 +59,8 @@ specification_version: 3
|
|
58
59
|
summary: Yomu is a library for extracting text and metadata using the Apache TIKA
|
59
60
|
content analysis toolkit.
|
60
61
|
test_files:
|
62
|
+
- test/samples/sample filename with spaces.pages
|
61
63
|
- test/samples/sample.pages
|
62
64
|
- test/test_helper.rb
|
63
65
|
- test/yomu_test.rb
|
66
|
+
has_rdoc:
|