rika 1.0.0-java → 1.1.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/README.md +42 -24
- data/lib/rika.rb +17 -1
- data/lib/rika/version.rb +1 -1
- data/spec/rika_spec.rb +18 -1
- metadata +9 -9
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -25,42 +25,60 @@ Or install it yourself as:
|
|
25
25
|
|
26
26
|
## Usage
|
27
27
|
|
28
|
-
|
28
|
+
For a quick start with the simplest use cases, the following functions
|
29
|
+
are provided to get what you need in a single function call, for your convenience:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
require 'rika'
|
33
|
+
|
34
|
+
content = Rika.parse_content('document.pdf') # string containing all content text
|
35
|
+
metadata = Rika.parse_metadata('document.pdf') # hash containing the document metadata
|
36
|
+
content, metadata = Rika.parse_content_and_metadata('document.pdf') # both of the above
|
37
|
+
```
|
38
|
+
|
39
|
+
For other use cases and finer control, you can work directly with the Rika::Parser object:
|
40
|
+
|
29
41
|
```ruby
|
30
|
-
|
42
|
+
require 'rika'
|
31
43
|
|
32
|
-
|
44
|
+
parser = Rika::Parser.new('document.pdf')
|
33
45
|
|
34
|
-
|
35
|
-
|
46
|
+
# Return the content of the document:
|
47
|
+
parser.content
|
36
48
|
|
37
|
-
|
38
|
-
|
39
|
-
|
49
|
+
# Return the media type for the document:
|
50
|
+
parser.media_type
|
51
|
+
=> "application/pdf"
|
40
52
|
|
41
|
-
|
42
|
-
|
53
|
+
# Return the metadata field title if it exists:
|
54
|
+
parser.metadata["title"] if parser.metadata_exists?("title")
|
43
55
|
|
44
|
-
|
45
|
-
|
56
|
+
# Return all the available metadata keys that can be read from the document
|
57
|
+
parser.available_metadata
|
46
58
|
|
47
|
-
|
48
|
-
|
49
|
-
|
59
|
+
# Return only the first 10000 chars of the content:
|
60
|
+
parser = Rika::Parser.new('document.pdf', 10000)
|
61
|
+
parser.content # 10000 first chars returned
|
50
62
|
|
51
|
-
|
52
|
-
|
53
|
-
|
63
|
+
# Return content from URL
|
64
|
+
parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
|
65
|
+
parser.content
|
54
66
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
67
|
+
# Return the language for the content
|
68
|
+
parser = Rika::Parser.new('german document.pdf')
|
69
|
+
parser.language
|
70
|
+
=> "de"
|
59
71
|
|
60
|
-
|
61
|
-
|
72
|
+
# Check whether the language identification is certain enough to be trusted
|
73
|
+
parser.language_is_reasonably_certain?
|
62
74
|
|
63
75
|
```
|
76
|
+
|
77
|
+
## Credits
|
78
|
+
The following people have contributed ideas, documentation, or code to Rika:
|
79
|
+
* Keith Bennett
|
80
|
+
* Richard Nyström
|
81
|
+
|
64
82
|
## Contributing
|
65
83
|
|
66
84
|
1. Fork it
|
data/lib/rika.rb
CHANGED
@@ -18,7 +18,23 @@ module Rika
|
|
18
18
|
import org.apache.tika.language.LanguageIdentifier
|
19
19
|
import java.io.FileInputStream
|
20
20
|
import java.net.URL
|
21
|
-
|
21
|
+
|
22
|
+
def self.parse_content_and_metadata(file_location, max_content_length = -1)
|
23
|
+
parser = Parser.new(file_location, max_content_length)
|
24
|
+
[parser.content, parser.metadata]
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.parse_content(file_location, max_content_length = -1)
|
28
|
+
parser = Parser.new(file_location, max_content_length)
|
29
|
+
parser.content
|
30
|
+
end
|
31
|
+
|
32
|
+
def self.parse_metadata(file_location)
|
33
|
+
parser = Parser.new(file_location, 0)
|
34
|
+
parser.metadata
|
35
|
+
end
|
36
|
+
|
37
|
+
|
22
38
|
class Parser
|
23
39
|
|
24
40
|
def initialize(file_location, max_content_length = -1)
|
data/lib/rika/version.rb
CHANGED
data/spec/rika_spec.rb
CHANGED
@@ -21,6 +21,7 @@ describe Rika::Parser do
|
|
21
21
|
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
22
22
|
@server.start
|
23
23
|
end
|
24
|
+
@sample_pdf_filespec = file_path("document.pdf")
|
24
25
|
end
|
25
26
|
|
26
27
|
after(:all) do
|
@@ -173,4 +174,20 @@ describe Rika::Parser do
|
|
173
174
|
lang.language_is_reasonably_certain? == true
|
174
175
|
end
|
175
176
|
end
|
176
|
-
|
177
|
+
|
178
|
+
it "should return valid content using Rika.parse_content" do
|
179
|
+
content = Rika.parse_content(@sample_pdf_filespec)
|
180
|
+
(content.should be_a(String)) && (content.should_not be_empty)
|
181
|
+
end
|
182
|
+
|
183
|
+
it "should return valid metadata using Rika.parse_metadata" do
|
184
|
+
metadata = Rika.parse_metadata(@sample_pdf_filespec)
|
185
|
+
(metadata.should be_a(Hash)) && (metadata.should_not be_empty)
|
186
|
+
end
|
187
|
+
|
188
|
+
it "should return valid content and metadata using Rika.parse_content_and_metadata" do
|
189
|
+
content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
|
190
|
+
(content.should be_a(String)) && (content.should_not be_empty) && \
|
191
|
+
(metadata.should be_a(Hash)) && (metadata.should_not be_empty)
|
192
|
+
end
|
193
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-02-
|
12
|
+
date: 2013-02-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -43,17 +43,17 @@ dependencies:
|
|
43
43
|
none: false
|
44
44
|
prerelease: false
|
45
45
|
type: :development
|
46
|
-
description:
|
47
|
-
|
46
|
+
description: " A JRuby wrapper for Apache Tika to extract text and metadata from various\
|
47
|
+
\ file formats. "
|
48
48
|
email:
|
49
49
|
- ricny046@gmail.com
|
50
50
|
executables: []
|
51
51
|
extensions: []
|
52
52
|
extra_rdoc_files: []
|
53
53
|
files:
|
54
|
-
- .gitignore
|
55
|
-
- .rspec
|
56
|
-
- .travis.yml
|
54
|
+
- ".gitignore"
|
55
|
+
- ".rspec"
|
56
|
+
- ".travis.yml"
|
57
57
|
- Gemfile
|
58
58
|
- LICENSE.txt
|
59
59
|
- README.md
|
@@ -123,14 +123,14 @@ require_paths:
|
|
123
123
|
- lib
|
124
124
|
required_ruby_version: !ruby/object:Gem::Requirement
|
125
125
|
requirements:
|
126
|
-
- -
|
126
|
+
- - ">="
|
127
127
|
- !ruby/object:Gem::Version
|
128
128
|
version: !binary |-
|
129
129
|
MA==
|
130
130
|
none: false
|
131
131
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
132
|
requirements:
|
133
|
-
- -
|
133
|
+
- - ">="
|
134
134
|
- !ruby/object:Gem::Version
|
135
135
|
version: !binary |-
|
136
136
|
MA==
|