yomu 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +33 -18
- data/jar/{tika-app-1.4.jar → tika-app-1.5.jar} +0 -0
- data/lib/yomu.rb +16 -1
- data/lib/yomu/version.rb +1 -1
- metadata +18 -30
- data/test/helper.rb +0 -3
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 23b6ed327cc94e45aea195294d84bb8dd669f1a6
|
4
|
+
data.tar.gz: c5ef3d9857242820b04f7c76f2f90834a7235a62
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 195f73de69ed781c97f10fffac2971a258a2f23ede81e6ebbc1ce34400c38de4fccac93b765df37c0ca3ab105a95a682977fcc6c71a3d86f810a674e2226ed0e
|
7
|
+
data.tar.gz: a5f562f9de92eedf29a2cf889f1083f03d19928f10998998131c06f3457613e40e06a7fd2f116a88bb74058e1bceb7fe62402aa6204e0fe1a81adcc71cf1c422
|
data/README.md
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
+

|
2
|
+
|
1
3
|
# Yomu 読む
|
4
|
+
|
2
5
|
[Yomu](http://erol.github.com/yomu) is a library for extracting text and metadata from files and documents using the [Apache Tika](http://tika.apache.org/) content analysis toolkit.
|
3
6
|
|
4
7
|
Here are some of the formats supported:
|
@@ -17,50 +20,62 @@ For the complete list of supported formats, please visit the Apache Tika
|
|
17
20
|
|
18
21
|
Text, metadata and MIME type information can be extracted by calling `Yomu.read` directly:
|
19
22
|
|
20
|
-
|
23
|
+
```ruby
|
24
|
+
require 'yomu'
|
21
25
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
+
data = File.read 'sample.pages'
|
27
|
+
text = Yomu.read :text, data
|
28
|
+
metadata = Yomu.read :metadata, data
|
29
|
+
mimetype = Yomu.read :mimetype, data
|
30
|
+
```
|
26
31
|
|
27
32
|
### Reading text from a given filename
|
28
33
|
|
29
34
|
Create a new instance of Yomu and pass a filename.
|
30
35
|
|
31
|
-
|
32
|
-
|
36
|
+
```ruby
|
37
|
+
yomu = Yomu.new 'sample.pages'
|
38
|
+
text = yomu.text
|
39
|
+
```
|
33
40
|
|
34
41
|
### Reading text from a given URL
|
35
42
|
|
36
43
|
This is useful for reading remote files, like documents hosted on Amazon S3.
|
37
44
|
|
38
|
-
|
39
|
-
|
45
|
+
```ruby
|
46
|
+
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
47
|
+
text = yomu.text
|
48
|
+
```
|
40
49
|
|
41
50
|
### Reading text from a stream
|
42
51
|
|
43
52
|
Yomu can also read from a stream or any object that responds to `read`, including file uploads from Ruby on Rails or Sinatra.
|
44
53
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
54
|
+
```ruby
|
55
|
+
post '/:name/:filename' do
|
56
|
+
yomu = Yomu.new params[:data][:tempfile]
|
57
|
+
yomu.text
|
58
|
+
end
|
59
|
+
```
|
49
60
|
|
50
61
|
### Reading metadata
|
51
62
|
|
52
63
|
Metadata is returned as a hash.
|
53
64
|
|
54
|
-
|
55
|
-
|
65
|
+
```ruby
|
66
|
+
yomu = Yomu.new 'sample.pages'
|
67
|
+
yomu.metadata['Content-Type'] #=> "application/vnd.apple.pages"
|
68
|
+
```
|
56
69
|
|
57
70
|
### Reading MIME types
|
58
71
|
|
59
72
|
MIME type is returned as a MIME::Type object.
|
60
73
|
|
61
|
-
|
62
|
-
|
63
|
-
|
74
|
+
```ruby
|
75
|
+
yomu = Yomu.new 'sample.docx'
|
76
|
+
yomu.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
77
|
+
yomu.mimetype.extensions #=> ['docx']
|
78
|
+
```
|
64
79
|
|
65
80
|
## Installation and Dependencies
|
66
81
|
|
Binary file
|
data/lib/yomu.rb
CHANGED
@@ -6,7 +6,7 @@ require 'yaml'
|
|
6
6
|
|
7
7
|
class Yomu
|
8
8
|
GEMPATH = File.dirname(File.dirname(__FILE__))
|
9
|
-
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.4.jar')
|
9
|
+
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.5.jar')
|
10
10
|
|
11
11
|
# Read text or metadata from a data buffer.
|
12
12
|
#
|
@@ -18,6 +18,8 @@ class Yomu
|
|
18
18
|
switch = case type
|
19
19
|
when :text
|
20
20
|
'-t'
|
21
|
+
when :html
|
22
|
+
'-h'
|
21
23
|
when :metadata
|
22
24
|
'-m'
|
23
25
|
when :mimetype
|
@@ -33,6 +35,8 @@ class Yomu
|
|
33
35
|
case type
|
34
36
|
when :text
|
35
37
|
result
|
38
|
+
when :html
|
39
|
+
result
|
36
40
|
when :metadata
|
37
41
|
YAML.load quote(result)
|
38
42
|
when :mimetype
|
@@ -81,6 +85,17 @@ class Yomu
|
|
81
85
|
@text = Yomu.read :text, data
|
82
86
|
end
|
83
87
|
|
88
|
+
# Returns the text content of the Yomu document in HTML.
|
89
|
+
#
|
90
|
+
# yomu = Yomu.new 'sample.pages'
|
91
|
+
# yomu.html
|
92
|
+
|
93
|
+
def html
|
94
|
+
return @text if defined? @text
|
95
|
+
|
96
|
+
@text = Yomu.read :html, data
|
97
|
+
end
|
98
|
+
|
84
99
|
# Returns the metadata hash of the Yomu document.
|
85
100
|
#
|
86
101
|
# yomu = Yomu.new 'sample.pages'
|
data/lib/yomu/version.rb
CHANGED
metadata
CHANGED
@@ -1,78 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yomu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.10
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Erol Fornoles
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-02-27 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: mime-types
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- - ~>
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '1.23'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- - ~>
|
24
|
+
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '1.23'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: bundler
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- - ~>
|
31
|
+
- - "~>"
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '1.3'
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- - ~>
|
38
|
+
- - "~>"
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '1.3'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: rake
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- -
|
45
|
+
- - ">="
|
52
46
|
- !ruby/object:Gem::Version
|
53
47
|
version: '0'
|
54
48
|
type: :development
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - ">="
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: '0'
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: rspec
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- - ~>
|
59
|
+
- - "~>"
|
68
60
|
- !ruby/object:Gem::Version
|
69
61
|
version: '2.14'
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- - ~>
|
66
|
+
- - "~>"
|
76
67
|
- !ruby/object:Gem::Version
|
77
68
|
version: '2.14'
|
78
69
|
description: Read text and metadata from files and documents (.doc, .docx, .pages,
|
@@ -83,14 +74,14 @@ executables: []
|
|
83
74
|
extensions: []
|
84
75
|
extra_rdoc_files: []
|
85
76
|
files:
|
86
|
-
- .gitignore
|
87
|
-
- .rspec
|
77
|
+
- ".gitignore"
|
78
|
+
- ".rspec"
|
88
79
|
- Gemfile
|
89
80
|
- LICENSE
|
90
81
|
- NOTICE.txt
|
91
82
|
- README.md
|
92
83
|
- Rakefile
|
93
|
-
- jar/tika-app-1.4.jar
|
84
|
+
- jar/tika-app-1.5.jar
|
94
85
|
- lib/yomu.rb
|
95
86
|
- lib/yomu/version.rb
|
96
87
|
- spec/helper.rb
|
@@ -99,32 +90,30 @@ files:
|
|
99
90
|
- spec/samples/sample.docx
|
100
91
|
- spec/samples/sample.pages
|
101
92
|
- spec/yomu_spec.rb
|
102
|
-
- test/helper.rb
|
103
93
|
- yomu.gemspec
|
104
94
|
homepage: http://erol.github.com/yomu
|
105
95
|
licenses:
|
106
96
|
- MIT
|
97
|
+
metadata: {}
|
107
98
|
post_install_message:
|
108
99
|
rdoc_options: []
|
109
100
|
require_paths:
|
110
101
|
- lib
|
111
102
|
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
-
none: false
|
113
103
|
requirements:
|
114
|
-
- -
|
104
|
+
- - ">="
|
115
105
|
- !ruby/object:Gem::Version
|
116
106
|
version: '0'
|
117
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
|
-
none: false
|
119
108
|
requirements:
|
120
|
-
- -
|
109
|
+
- - ">="
|
121
110
|
- !ruby/object:Gem::Version
|
122
111
|
version: '0'
|
123
112
|
requirements: []
|
124
113
|
rubyforge_project:
|
125
|
-
rubygems_version:
|
114
|
+
rubygems_version: 2.2.0
|
126
115
|
signing_key:
|
127
|
-
specification_version:
|
116
|
+
specification_version: 4
|
128
117
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
129
118
|
.rtf, .pdf)
|
130
119
|
test_files:
|
@@ -134,4 +123,3 @@ test_files:
|
|
134
123
|
- spec/samples/sample.docx
|
135
124
|
- spec/samples/sample.pages
|
136
125
|
- spec/yomu_spec.rb
|
137
|
-
- test/helper.rb
|
data/test/helper.rb
DELETED