rika 1.0.0-java → 1.1.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/README.md +42 -24
- data/lib/rika.rb +17 -1
- data/lib/rika/version.rb +1 -1
- data/spec/rika_spec.rb +18 -1
- metadata +9 -9
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -25,42 +25,60 @@ Or install it yourself as:
|
|
25
25
|
|
26
26
|
## Usage
|
27
27
|
|
28
|
-
|
28
|
+
For a quick start with the simplest use cases, the following functions
|
29
|
+
are provided to get what you need in a single function call, for your convenience:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
require 'rika'
|
33
|
+
|
34
|
+
content = Rika.parse_content('document.pdf') # string containing all content text
|
35
|
+
metadata = Rika.parse_metadata('document.pdf') # hash containing the document metadata
|
36
|
+
content, metadata = Rika.parse_content_and_metadata('document.pdf') # both of the above
|
37
|
+
```
|
38
|
+
|
39
|
+
For other use cases and finer control, you can work directly with the Rika::Parser object:
|
40
|
+
|
29
41
|
```ruby
|
30
|
-
|
42
|
+
require 'rika'
|
31
43
|
|
32
|
-
|
44
|
+
parser = Rika::Parser.new('document.pdf')
|
33
45
|
|
34
|
-
|
35
|
-
|
46
|
+
# Return the content of the document:
|
47
|
+
parser.content
|
36
48
|
|
37
|
-
|
38
|
-
|
39
|
-
|
49
|
+
# Return the media type for the document:
|
50
|
+
parser.media_type
|
51
|
+
=> "application/pdf"
|
40
52
|
|
41
|
-
|
42
|
-
|
53
|
+
# Return the metadata field title if it exists:
|
54
|
+
parser.metadata["title"] if parser.metadata_exists?("title")
|
43
55
|
|
44
|
-
|
45
|
-
|
56
|
+
# Return all the available metadata keys that can be read from the document
|
57
|
+
parser.available_metadata
|
46
58
|
|
47
|
-
|
48
|
-
|
49
|
-
|
59
|
+
# Return only the first 10000 chars of the content:
|
60
|
+
parser = Rika::Parser.new('document.pdf', 10000)
|
61
|
+
parser.content # 10000 first chars returned
|
50
62
|
|
51
|
-
|
52
|
-
|
53
|
-
|
63
|
+
# Return content from URL
|
64
|
+
parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
|
65
|
+
parser.content
|
54
66
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
67
|
+
# Return the language for the content
|
68
|
+
parser = Rika::Parser.new('german document.pdf')
|
69
|
+
parser.language
|
70
|
+
=> "de"
|
59
71
|
|
60
|
-
|
61
|
-
|
72
|
+
# Check whether the language identification is certain enough to be trusted
|
73
|
+
parser.language_is_reasonably_certain?
|
62
74
|
|
63
75
|
```
|
76
|
+
|
77
|
+
## Credits
|
78
|
+
The following people have contributed ideas, documentation, or code to Rika:
|
79
|
+
* Keith Bennett
|
80
|
+
* Richard Nyström
|
81
|
+
|
64
82
|
## Contributing
|
65
83
|
|
66
84
|
1. Fork it
|
data/lib/rika.rb
CHANGED
@@ -18,7 +18,23 @@ module Rika
|
|
18
18
|
import org.apache.tika.language.LanguageIdentifier
|
19
19
|
import java.io.FileInputStream
|
20
20
|
import java.net.URL
|
21
|
-
|
21
|
+
|
22
|
+
def self.parse_content_and_metadata(file_location, max_content_length = -1)
|
23
|
+
parser = Parser.new(file_location, max_content_length)
|
24
|
+
[parser.content, parser.metadata]
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.parse_content(file_location, max_content_length = -1)
|
28
|
+
parser = Parser.new(file_location, max_content_length)
|
29
|
+
parser.content
|
30
|
+
end
|
31
|
+
|
32
|
+
def self.parse_metadata(file_location)
|
33
|
+
parser = Parser.new(file_location, 0)
|
34
|
+
parser.metadata
|
35
|
+
end
|
36
|
+
|
37
|
+
|
22
38
|
class Parser
|
23
39
|
|
24
40
|
def initialize(file_location, max_content_length = -1)
|
data/lib/rika/version.rb
CHANGED
data/spec/rika_spec.rb
CHANGED
@@ -21,6 +21,7 @@ describe Rika::Parser do
|
|
21
21
|
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
22
22
|
@server.start
|
23
23
|
end
|
24
|
+
@sample_pdf_filespec = file_path("document.pdf")
|
24
25
|
end
|
25
26
|
|
26
27
|
after(:all) do
|
@@ -173,4 +174,20 @@ describe Rika::Parser do
|
|
173
174
|
lang.language_is_reasonably_certain? == true
|
174
175
|
end
|
175
176
|
end
|
176
|
-
|
177
|
+
|
178
|
+
it "should return valid content using Rika.parse_content" do
|
179
|
+
content = Rika.parse_content(@sample_pdf_filespec)
|
180
|
+
(content.should be_a(String)) && (content.should_not be_empty)
|
181
|
+
end
|
182
|
+
|
183
|
+
it "should return valid metadata using Rika.parse_metadata" do
|
184
|
+
metadata = Rika.parse_metadata(@sample_pdf_filespec)
|
185
|
+
(metadata.should be_a(Hash)) && (metadata.should_not be_empty)
|
186
|
+
end
|
187
|
+
|
188
|
+
it "should return valid content and metadata using Rika.parse_content_and_metadata" do
|
189
|
+
content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
|
190
|
+
(content.should be_a(String)) && (content.should_not be_empty) && \
|
191
|
+
(metadata.should be_a(Hash)) && (metadata.should_not be_empty)
|
192
|
+
end
|
193
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-02-
|
12
|
+
date: 2013-02-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -43,17 +43,17 @@ dependencies:
|
|
43
43
|
none: false
|
44
44
|
prerelease: false
|
45
45
|
type: :development
|
46
|
-
description:
|
47
|
-
|
46
|
+
description: " A JRuby wrapper for Apache Tika to extract text and metadata from various\
|
47
|
+
\ file formats. "
|
48
48
|
email:
|
49
49
|
- ricny046@gmail.com
|
50
50
|
executables: []
|
51
51
|
extensions: []
|
52
52
|
extra_rdoc_files: []
|
53
53
|
files:
|
54
|
-
- .gitignore
|
55
|
-
- .rspec
|
56
|
-
- .travis.yml
|
54
|
+
- ".gitignore"
|
55
|
+
- ".rspec"
|
56
|
+
- ".travis.yml"
|
57
57
|
- Gemfile
|
58
58
|
- LICENSE.txt
|
59
59
|
- README.md
|
@@ -123,14 +123,14 @@ require_paths:
|
|
123
123
|
- lib
|
124
124
|
required_ruby_version: !ruby/object:Gem::Requirement
|
125
125
|
requirements:
|
126
|
-
- -
|
126
|
+
- - ">="
|
127
127
|
- !ruby/object:Gem::Version
|
128
128
|
version: !binary |-
|
129
129
|
MA==
|
130
130
|
none: false
|
131
131
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
132
|
requirements:
|
133
|
-
- -
|
133
|
+
- - ">="
|
134
134
|
- !ruby/object:Gem::Version
|
135
135
|
version: !binary |-
|
136
136
|
MA==
|