yomu 0.1.9 → 0.1.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +33 -18
- data/jar/{tika-app-1.4.jar → tika-app-1.5.jar} +0 -0
- data/lib/yomu.rb +16 -1
- data/lib/yomu/version.rb +1 -1
- metadata +18 -30
- data/test/helper.rb +0 -3
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 23b6ed327cc94e45aea195294d84bb8dd669f1a6
|
4
|
+
data.tar.gz: c5ef3d9857242820b04f7c76f2f90834a7235a62
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 195f73de69ed781c97f10fffac2971a258a2f23ede81e6ebbc1ce34400c38de4fccac93b765df37c0ca3ab105a95a682977fcc6c71a3d86f810a674e2226ed0e
|
7
|
+
data.tar.gz: a5f562f9de92eedf29a2cf889f1083f03d19928f10998998131c06f3457613e40e06a7fd2f116a88bb74058e1bceb7fe62402aa6204e0fe1a81adcc71cf1c422
|
data/README.md
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
+
![Google Analytics](https://ga-beacon.appspot.com/UA-31066891-2/yomu/code)
|
2
|
+
|
1
3
|
# Yomu 読む
|
4
|
+
|
2
5
|
[Yomu](http://erol.github.com/yomu) is a library for extracting text and metadata from files and documents using the [Apache Tika](http://tika.apache.org/) content analysis toolkit.
|
3
6
|
|
4
7
|
Here are some of the formats supported:
|
@@ -17,50 +20,62 @@ For the complete list of supported formats, please visit the Apache Tika
|
|
17
20
|
|
18
21
|
Text, metadata and MIME type information can be extracted by calling `Yomu.read` directly:
|
19
22
|
|
20
|
-
|
23
|
+
```ruby
|
24
|
+
require 'yomu'
|
21
25
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
+
data = File.read 'sample.pages'
|
27
|
+
text = Yomu.read :text, data
|
28
|
+
metadata = Yomu.read :metadata, data
|
29
|
+
mimetype = Yomu.read :mimetype, data
|
30
|
+
```
|
26
31
|
|
27
32
|
### Reading text from a given filename
|
28
33
|
|
29
34
|
Create a new instance of Yomu and pass a filename.
|
30
35
|
|
31
|
-
|
32
|
-
|
36
|
+
```ruby
|
37
|
+
yomu = Yomu.new 'sample.pages'
|
38
|
+
text = yomu.text
|
39
|
+
```
|
33
40
|
|
34
41
|
### Reading text from a given URL
|
35
42
|
|
36
43
|
This is useful for reading remote files, like documents hosted on Amazon S3.
|
37
44
|
|
38
|
-
|
39
|
-
|
45
|
+
```ruby
|
46
|
+
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
47
|
+
text = yomu.text
|
48
|
+
```
|
40
49
|
|
41
50
|
### Reading text from a stream
|
42
51
|
|
43
52
|
Yomu can also read from a stream or any object that responds to `read`, including file uploads from Ruby on Rails or Sinatra.
|
44
53
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
54
|
+
```ruby
|
55
|
+
post '/:name/:filename' do
|
56
|
+
yomu = Yomu.new params[:data][:tempfile]
|
57
|
+
yomu.text
|
58
|
+
end
|
59
|
+
```
|
49
60
|
|
50
61
|
### Reading metadata
|
51
62
|
|
52
63
|
Metadata is returned as a hash.
|
53
64
|
|
54
|
-
|
55
|
-
|
65
|
+
```ruby
|
66
|
+
yomu = Yomu.new 'sample.pages'
|
67
|
+
yomu.metadata['Content-Type'] #=> "application/vnd.apple.pages"
|
68
|
+
```
|
56
69
|
|
57
70
|
### Reading MIME types
|
58
71
|
|
59
72
|
MIME type is returned as a MIME::Type object.
|
60
73
|
|
61
|
-
|
62
|
-
|
63
|
-
|
74
|
+
```ruby
|
75
|
+
yomu = Yomu.new 'sample.docx'
|
76
|
+
yomu.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
77
|
+
yomu.mimetype.extensions #=> ['docx']
|
78
|
+
```
|
64
79
|
|
65
80
|
## Installation and Dependencies
|
66
81
|
|
Binary file
|
data/lib/yomu.rb
CHANGED
@@ -6,7 +6,7 @@ require 'yaml'
|
|
6
6
|
|
7
7
|
class Yomu
|
8
8
|
GEMPATH = File.dirname(File.dirname(__FILE__))
|
9
|
-
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.4.jar')
|
9
|
+
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.5.jar')
|
10
10
|
|
11
11
|
# Read text or metadata from a data buffer.
|
12
12
|
#
|
@@ -18,6 +18,8 @@ class Yomu
|
|
18
18
|
switch = case type
|
19
19
|
when :text
|
20
20
|
'-t'
|
21
|
+
when :html
|
22
|
+
'-h'
|
21
23
|
when :metadata
|
22
24
|
'-m'
|
23
25
|
when :mimetype
|
@@ -33,6 +35,8 @@ class Yomu
|
|
33
35
|
case type
|
34
36
|
when :text
|
35
37
|
result
|
38
|
+
when :html
|
39
|
+
result
|
36
40
|
when :metadata
|
37
41
|
YAML.load quote(result)
|
38
42
|
when :mimetype
|
@@ -81,6 +85,17 @@ class Yomu
|
|
81
85
|
@text = Yomu.read :text, data
|
82
86
|
end
|
83
87
|
|
88
|
+
# Returns the text content of the Yomu document in HTML.
|
89
|
+
#
|
90
|
+
# yomu = Yomu.new 'sample.pages'
|
91
|
+
# yomu.html
|
92
|
+
|
93
|
+
def html
|
94
|
+
return @text if defined? @text
|
95
|
+
|
96
|
+
@text = Yomu.read :html, data
|
97
|
+
end
|
98
|
+
|
84
99
|
# Returns the metadata hash of the Yomu document.
|
85
100
|
#
|
86
101
|
# yomu = Yomu.new 'sample.pages'
|
data/lib/yomu/version.rb
CHANGED
metadata
CHANGED
@@ -1,78 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yomu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.10
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Erol Fornoles
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-02-27 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: mime-types
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- - ~>
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '1.23'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- - ~>
|
24
|
+
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '1.23'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: bundler
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- - ~>
|
31
|
+
- - "~>"
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '1.3'
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- - ~>
|
38
|
+
- - "~>"
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '1.3'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: rake
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- -
|
45
|
+
- - ">="
|
52
46
|
- !ruby/object:Gem::Version
|
53
47
|
version: '0'
|
54
48
|
type: :development
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - ">="
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: '0'
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: rspec
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- - ~>
|
59
|
+
- - "~>"
|
68
60
|
- !ruby/object:Gem::Version
|
69
61
|
version: '2.14'
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- - ~>
|
66
|
+
- - "~>"
|
76
67
|
- !ruby/object:Gem::Version
|
77
68
|
version: '2.14'
|
78
69
|
description: Read text and metadata from files and documents (.doc, .docx, .pages,
|
@@ -83,14 +74,14 @@ executables: []
|
|
83
74
|
extensions: []
|
84
75
|
extra_rdoc_files: []
|
85
76
|
files:
|
86
|
-
- .gitignore
|
87
|
-
- .rspec
|
77
|
+
- ".gitignore"
|
78
|
+
- ".rspec"
|
88
79
|
- Gemfile
|
89
80
|
- LICENSE
|
90
81
|
- NOTICE.txt
|
91
82
|
- README.md
|
92
83
|
- Rakefile
|
93
|
-
- jar/tika-app-1.4.jar
|
84
|
+
- jar/tika-app-1.5.jar
|
94
85
|
- lib/yomu.rb
|
95
86
|
- lib/yomu/version.rb
|
96
87
|
- spec/helper.rb
|
@@ -99,32 +90,30 @@ files:
|
|
99
90
|
- spec/samples/sample.docx
|
100
91
|
- spec/samples/sample.pages
|
101
92
|
- spec/yomu_spec.rb
|
102
|
-
- test/helper.rb
|
103
93
|
- yomu.gemspec
|
104
94
|
homepage: http://erol.github.com/yomu
|
105
95
|
licenses:
|
106
96
|
- MIT
|
97
|
+
metadata: {}
|
107
98
|
post_install_message:
|
108
99
|
rdoc_options: []
|
109
100
|
require_paths:
|
110
101
|
- lib
|
111
102
|
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
-
none: false
|
113
103
|
requirements:
|
114
|
-
- -
|
104
|
+
- - ">="
|
115
105
|
- !ruby/object:Gem::Version
|
116
106
|
version: '0'
|
117
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
|
-
none: false
|
119
108
|
requirements:
|
120
|
-
- -
|
109
|
+
- - ">="
|
121
110
|
- !ruby/object:Gem::Version
|
122
111
|
version: '0'
|
123
112
|
requirements: []
|
124
113
|
rubyforge_project:
|
125
|
-
rubygems_version:
|
114
|
+
rubygems_version: 2.2.0
|
126
115
|
signing_key:
|
127
|
-
specification_version:
|
116
|
+
specification_version: 4
|
128
117
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
129
118
|
.rtf, .pdf)
|
130
119
|
test_files:
|
@@ -134,4 +123,3 @@ test_files:
|
|
134
123
|
- spec/samples/sample.docx
|
135
124
|
- spec/samples/sample.pages
|
136
125
|
- spec/yomu_spec.rb
|
137
|
-
- test/helper.rb
|
data/test/helper.rb
DELETED