rika 0.9.7-java → 0.9.8-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rika.rb +30 -22
- data/lib/rika/version.rb +1 -1
- data/spec/rika_spec.rb +8 -8
- metadata +2 -2
data/lib/rika.rb
CHANGED
@@ -11,7 +11,7 @@ Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |ja
|
|
11
11
|
require jar
|
12
12
|
end
|
13
13
|
|
14
|
-
# Heavily based on the Apache Tika API: http://tika.apache.org/1.
|
14
|
+
# Heavily based on the Apache Tika API: http://tika.apache.org/1.3/api/org/apache/tika/Tika.html
|
15
15
|
module Rika
|
16
16
|
import org.apache.tika.metadata.Metadata
|
17
17
|
import org.apache.tika.Tika
|
@@ -24,14 +24,9 @@ module Rika
|
|
24
24
|
@uri = file_location
|
25
25
|
@tika = Tika.new
|
26
26
|
@tika.set_max_string_length(max_content_length)
|
27
|
-
@
|
28
|
-
|
29
|
-
@
|
30
|
-
is_http = URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess) if !@is_file
|
31
|
-
|
32
|
-
if !@is_file && !is_http
|
33
|
-
raise IOError, "File does not exist or can't be reached."
|
34
|
-
end
|
27
|
+
@metadata_java = Metadata.new
|
28
|
+
@metadata_ruby = nil
|
29
|
+
@input_type = get_input_type
|
35
30
|
end
|
36
31
|
|
37
32
|
def content
|
@@ -40,14 +35,15 @@ module Rika
|
|
40
35
|
end
|
41
36
|
|
42
37
|
def metadata
|
43
|
-
|
44
|
-
|
38
|
+
unless @metadata_ruby
|
39
|
+
self.parse
|
40
|
+
@metadata_ruby = {}
|
45
41
|
|
46
|
-
|
47
|
-
|
42
|
+
@metadata_java.names.each do |name|
|
43
|
+
@metadata_ruby[name] = @metadata_java.get(name)
|
44
|
+
end
|
48
45
|
end
|
49
|
-
|
50
|
-
metadata_hash
|
46
|
+
@metadata_ruby
|
51
47
|
end
|
52
48
|
|
53
49
|
def media_type
|
@@ -55,25 +51,37 @@ module Rika
|
|
55
51
|
end
|
56
52
|
|
57
53
|
def available_metadata
|
58
|
-
|
59
|
-
@metadata.names.to_a
|
54
|
+
metadata.keys
|
60
55
|
end
|
61
56
|
|
62
57
|
def metadata_exists?(name)
|
63
|
-
|
64
|
-
|
58
|
+
metadata[name] != nil
|
59
|
+
end
|
60
|
+
|
61
|
+
def file?
|
62
|
+
@input_type == :file
|
65
63
|
end
|
66
64
|
|
67
65
|
protected
|
68
66
|
|
69
67
|
def parse
|
70
|
-
@content ||= @tika.parse_to_string(input_stream, @
|
68
|
+
@content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
69
|
+
end
|
70
|
+
|
71
|
+
def get_input_type
|
72
|
+
if File.exists?(@uri) && File.directory?(@uri) == false
|
73
|
+
:file
|
74
|
+
elsif URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
75
|
+
:http
|
76
|
+
else
|
77
|
+
raise IOError, "Input (#{@uri}) is neither file nor http."
|
78
|
+
end
|
71
79
|
end
|
72
80
|
|
73
81
|
def input_stream
|
74
|
-
if
|
82
|
+
if file?
|
75
83
|
FileInputStream.new(java.io.File.new(@uri))
|
76
|
-
else
|
84
|
+
else # :http
|
77
85
|
URL.new(@uri).open_stream
|
78
86
|
end
|
79
87
|
end
|
data/lib/rika/version.rb
CHANGED
data/spec/rika_spec.rb
CHANGED
@@ -15,7 +15,7 @@ describe Rika::Parser do
|
|
15
15
|
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
16
16
|
port = 50505
|
17
17
|
@url = "http://#{Socket.gethostname}:#{port}"
|
18
|
-
@
|
18
|
+
@quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
19
19
|
@t1 = Thread.new do
|
20
20
|
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
21
21
|
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
@@ -28,11 +28,11 @@ describe Rika::Parser do
|
|
28
28
|
end
|
29
29
|
|
30
30
|
it "should raise error if file does not exists" do
|
31
|
-
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError
|
31
|
+
lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should raise error if URL does not exists" do
|
35
|
-
lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError
|
35
|
+
lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError)
|
36
36
|
end
|
37
37
|
|
38
38
|
it "should detect file type without a file extension" do
|
@@ -41,20 +41,20 @@ describe Rika::Parser do
|
|
41
41
|
end
|
42
42
|
|
43
43
|
it "should not be possible to trick the parser to read a folder with an extension" do
|
44
|
-
lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError
|
44
|
+
lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError)
|
45
45
|
end
|
46
46
|
|
47
47
|
describe '#content' do
|
48
48
|
it "should return the content in a text file" do
|
49
|
-
@txt_parser.content.strip.should == @
|
49
|
+
@txt_parser.content.strip.should == @quote
|
50
50
|
end
|
51
51
|
|
52
52
|
it "should return the content in a docx file" do
|
53
|
-
@docx_parser.content.should == @
|
53
|
+
@docx_parser.content.should == @quote
|
54
54
|
end
|
55
55
|
|
56
56
|
it "should return the content in a pdf file" do
|
57
|
-
@pdf_parser.content.should == @
|
57
|
+
@pdf_parser.content.should == @quote
|
58
58
|
end
|
59
59
|
|
60
60
|
it "should return no content for an image" do
|
@@ -78,7 +78,7 @@ describe Rika::Parser do
|
|
78
78
|
|
79
79
|
it "should return the content from a file over http" do
|
80
80
|
parser = Rika::Parser.new(@url + "/document.pdf")
|
81
|
-
parser.content.should == @
|
81
|
+
parser.content.should == @quote
|
82
82
|
end
|
83
83
|
|
84
84
|
it "should return empty string for unknown file" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.7
|
4
|
+
version: 0.9.8
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|