rika 0.9.2-java → 0.9.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +5 -1
- data/lib/rika.rb +29 -22
- data/lib/rika/version.rb +1 -1
- data/spec/rika_spec.rb +36 -1
- data/spec/spec_helper.rb +1 -0
- metadata +2 -2
data/README.md
CHANGED
@@ -39,7 +39,11 @@ Something like this:
|
|
39
39
|
|
40
40
|
# Return only the first 10000 chars of the content:
|
41
41
|
parser = Rika::Parser.new('document.pdf', 10000)
|
42
|
-
parser.content # 10000 first chars returned
|
42
|
+
parser.content # 10000 first chars returned
|
43
|
+
|
44
|
+
# Return content from URL
|
45
|
+
parser = Rika::Parser.new('http://www.exampleurl.com/example.pdf')
|
46
|
+
parser.content
|
43
47
|
```
|
44
48
|
## Contributing
|
45
49
|
|
data/lib/rika.rb
CHANGED
@@ -1,24 +1,34 @@
|
|
1
1
|
raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
|
2
2
|
|
3
3
|
require "rika/version"
|
4
|
+
require 'uri'
|
5
|
+
require 'net/http'
|
4
6
|
require 'java'
|
5
7
|
|
6
8
|
Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
|
7
9
|
require jar
|
8
10
|
end
|
9
11
|
|
12
|
+
# Heavily based on the Apache Tika API: http://tika.apache.org/1.2/api/org/apache/tika/Tika.html
|
10
13
|
module Rika
|
11
14
|
import org.apache.tika.metadata.Metadata
|
12
15
|
import org.apache.tika.Tika
|
16
|
+
|
13
17
|
class Parser
|
14
18
|
|
15
|
-
def initialize(
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
19
|
+
def initialize(uri, max_content_length = -1)
|
20
|
+
p = URI::Parser.new
|
21
|
+
@uri = uri
|
22
|
+
@tika = Tika.new
|
23
|
+
@tika.set_max_string_length(max_content_length)
|
24
|
+
@metadata = Metadata.new
|
25
|
+
|
26
|
+
if File.exists?(@uri)
|
27
|
+
self.parse_file
|
28
|
+
elsif p.parse(@uri).scheme == 'http' || p.parse(@uri).scheme == 'https'
|
29
|
+
self.parse_url
|
20
30
|
else
|
21
|
-
raise IOError, "File does not exist"
|
31
|
+
raise IOError, "File does not exist or can't be reached."
|
22
32
|
end
|
23
33
|
end
|
24
34
|
|
@@ -41,26 +51,23 @@ module Rika
|
|
41
51
|
end
|
42
52
|
|
43
53
|
def metadata_exists?(name)
|
44
|
-
|
45
|
-
false
|
46
|
-
else
|
47
|
-
true
|
48
|
-
end
|
54
|
+
@metadata.get(name) != nil
|
49
55
|
end
|
50
56
|
|
51
57
|
protected
|
52
58
|
|
53
|
-
def
|
54
|
-
input_stream =
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
59
|
+
def parse_file
|
60
|
+
input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
|
61
|
+
@metadata.set("filename", File.basename(@uri))
|
62
|
+
@content = @tika.parse_to_string(input_stream, @metadata)
|
63
|
+
end
|
64
|
+
|
65
|
+
def parse_url
|
66
|
+
raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
67
|
+
url = java.net.URL.new(@uri)
|
68
|
+
input_stream = url.open_stream
|
69
|
+
@metadata.set("url", @uri)
|
70
|
+
@content = @tika.parse_to_string(input_stream, @metadata)
|
64
71
|
end
|
65
72
|
end
|
66
73
|
end
|
data/lib/rika/version.rb
CHANGED
data/spec/rika_spec.rb
CHANGED
@@ -1,17 +1,37 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'spec_helper'
|
4
|
+
require 'webrick'
|
4
5
|
|
6
|
+
include WEBrick
|
7
|
+
|
5
8
|
describe Rika::Parser do
|
6
9
|
before(:all) do
|
7
10
|
@txt_parser = Rika::Parser.new(file_path("text_file.txt"))
|
8
11
|
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
9
12
|
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
10
13
|
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
14
|
+
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
15
|
+
port = 50505
|
16
|
+
@url = "http://#{Socket.gethostname}:#{port}"
|
17
|
+
|
18
|
+
@t1 = Thread.new do
|
19
|
+
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
20
|
+
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
21
|
+
@server.start
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
after(:all) do
|
26
|
+
@t1.exit
|
11
27
|
end
|
12
28
|
|
13
29
|
it "should raise error if file does not exists" do
|
14
|
-
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist")
|
30
|
+
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist or can't be reached.")
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should raise error if URL does not exists" do
|
34
|
+
lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError, "File does not exist or can't be reached.")
|
15
35
|
end
|
16
36
|
|
17
37
|
it "should detect file type without a file extension" do
|
@@ -41,10 +61,20 @@ describe Rika::Parser do
|
|
41
61
|
parser.content.should == "First"
|
42
62
|
end
|
43
63
|
|
64
|
+
it "should only return max content length for file over http" do
|
65
|
+
parser = Rika::Parser.new(@url + "/document.pdf", 6)
|
66
|
+
parser.content.should == "First"
|
67
|
+
end
|
68
|
+
|
44
69
|
it "should be possible to read files over 100k by default" do
|
45
70
|
parser = Rika::Parser.new(file_path("over_100k_file.txt"))
|
46
71
|
parser.content.length.should == 101_761
|
47
72
|
end
|
73
|
+
|
74
|
+
it "should return the content from a file over http" do
|
75
|
+
parser = Rika::Parser.new(@url + "/document.pdf")
|
76
|
+
parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
|
77
|
+
end
|
48
78
|
end
|
49
79
|
|
50
80
|
# We just test a few of the metadata fields for some common file formats
|
@@ -67,6 +97,11 @@ describe Rika::Parser do
|
|
67
97
|
@pdf_parser.metadata["title"].should == "A simple title"
|
68
98
|
end
|
69
99
|
|
100
|
+
it "should return metadata from a file over http" do
|
101
|
+
parser = Rika::Parser.new(@url + "/document.pdf")
|
102
|
+
parser.metadata["title"].should == "A simple title"
|
103
|
+
end
|
104
|
+
|
70
105
|
it "should return metadata from an image" do
|
71
106
|
@image_parser.metadata["Image Height"].should == "72 pixels"
|
72
107
|
@image_parser.metadata["Image Width"].should == "72 pixels"
|
data/spec/spec_helper.rb
CHANGED
@@ -3,6 +3,7 @@ require "rika"
|
|
3
3
|
def file_path( *paths )
|
4
4
|
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
|
5
5
|
end
|
6
|
+
|
6
7
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
8
|
RSpec.configure do |config|
|
8
9
|
config.treat_symbols_as_metadata_keys_with_true_values = true
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.9.3
|
4
5
|
prerelease:
|
5
|
-
version: 0.9.2
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Richard Nyström
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|