rika 0.9.3-java → 0.9.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +5 -1
- data/lib/rika.rb +6 -0
- data/lib/rika/version.rb +1 -1
- data/spec/fixtures/unknown.bin +0 -0
- data/spec/rika_spec.rb +29 -5
- metadata +4 -2
data/README.md
CHANGED
@@ -31,6 +31,10 @@ Something like this:
|
|
31
31
|
# Return the content of the document:
|
32
32
|
parser.content
|
33
33
|
|
34
|
+
# Return the media type for the document:
|
35
|
+
parser.media_type
|
36
|
+
=> "application/pdf"
|
37
|
+
|
34
38
|
# Return the metadata field title if it exists:
|
35
39
|
parser.metadata["title"] if parser.metadata_exists?("title")
|
36
40
|
|
@@ -42,7 +46,7 @@ Something like this:
|
|
42
46
|
parser.content # 10000 first chars returned
|
43
47
|
|
44
48
|
# Return content from URL
|
45
|
-
parser = Rika::Parser.new('http://
|
49
|
+
parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
|
46
50
|
parser.content
|
47
51
|
```
|
48
52
|
## Contributing
|
data/lib/rika.rb
CHANGED
@@ -46,6 +46,10 @@ module Rika
|
|
46
46
|
metadata_hash
|
47
47
|
end
|
48
48
|
|
49
|
+
def media_type
|
50
|
+
@media_type
|
51
|
+
end
|
52
|
+
|
49
53
|
def available_metadata
|
50
54
|
@metadata.names.to_a
|
51
55
|
end
|
@@ -59,6 +63,7 @@ module Rika
|
|
59
63
|
def parse_file
|
60
64
|
input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
|
61
65
|
@metadata.set("filename", File.basename(@uri))
|
66
|
+
@media_type = @tika.detect(java.io.FileInputStream.new(java.io.File.new(@uri)))
|
62
67
|
@content = @tika.parse_to_string(input_stream, @metadata)
|
63
68
|
end
|
64
69
|
|
@@ -66,6 +71,7 @@ module Rika
|
|
66
71
|
raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
67
72
|
url = java.net.URL.new(@uri)
|
68
73
|
input_stream = url.open_stream
|
74
|
+
@media_type = @tika.detect(url.open_stream)
|
69
75
|
@metadata.set("url", @uri)
|
70
76
|
@content = @tika.parse_to_string(input_stream, @metadata)
|
71
77
|
end
|
data/lib/rika/version.rb
CHANGED
Binary file
|
data/spec/rika_spec.rb
CHANGED
@@ -11,10 +11,11 @@ describe Rika::Parser do
|
|
11
11
|
@docx_parser = Rika::Parser.new(file_path("document.docx"))
|
12
12
|
@pdf_parser = Rika::Parser.new(file_path("document.pdf"))
|
13
13
|
@image_parser = Rika::Parser.new(file_path("image.jpg"))
|
14
|
+
@unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
|
14
15
|
@dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
|
15
16
|
port = 50505
|
16
17
|
@url = "http://#{Socket.gethostname}:#{port}"
|
17
|
-
|
18
|
+
@qoute = "First they ignore you, then they ridicule you, then they fight you, then you win."
|
18
19
|
@t1 = Thread.new do
|
19
20
|
@server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
|
20
21
|
:AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
|
@@ -41,15 +42,15 @@ describe Rika::Parser do
|
|
41
42
|
|
42
43
|
describe '#content' do
|
43
44
|
it "should return the content in a text file" do
|
44
|
-
@txt_parser.content.strip.should ==
|
45
|
+
@txt_parser.content.strip.should == @qoute
|
45
46
|
end
|
46
47
|
|
47
48
|
it "should return the content in a docx file" do
|
48
|
-
@docx_parser.content.should ==
|
49
|
+
@docx_parser.content.should == @qoute
|
49
50
|
end
|
50
51
|
|
51
52
|
it "should return the content in a pdf file" do
|
52
|
-
@pdf_parser.content.should ==
|
53
|
+
@pdf_parser.content.should == @qoute
|
53
54
|
end
|
54
55
|
|
55
56
|
it "should return no content for an image" do
|
@@ -73,7 +74,11 @@ describe Rika::Parser do
|
|
73
74
|
|
74
75
|
it "should return the content from a file over http" do
|
75
76
|
parser = Rika::Parser.new(@url + "/document.pdf")
|
76
|
-
parser.content.should ==
|
77
|
+
parser.content.should == @qoute
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should return empty string for unknown file" do
|
81
|
+
@unknown_parser.content.should be_empty
|
77
82
|
end
|
78
83
|
end
|
79
84
|
|
@@ -127,4 +132,23 @@ describe Rika::Parser do
|
|
127
132
|
@docx_parser.metadata_exists?("title").should == true
|
128
133
|
end
|
129
134
|
end
|
135
|
+
|
136
|
+
describe '#media_type' do
|
137
|
+
it "should return application/pdf for a pdf file" do
|
138
|
+
@pdf_parser.media_type.should == "application/pdf"
|
139
|
+
end
|
140
|
+
|
141
|
+
it "should return text/plain for a txt file" do
|
142
|
+
@txt_parser.media_type.should == "text/plain"
|
143
|
+
end
|
144
|
+
|
145
|
+
it "should return application/pdf for a pdf over http" do
|
146
|
+
parser = Rika::Parser.new(@url + "/document.pdf")
|
147
|
+
parser.media_type.should == "application/pdf"
|
148
|
+
end
|
149
|
+
|
150
|
+
it "should return application/octet-stream for unknown file" do
|
151
|
+
@unknown_parser.media_type.should == "application/octet-stream"
|
152
|
+
end
|
153
|
+
end
|
130
154
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.3
|
4
|
+
version: 0.9.4
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -54,6 +54,7 @@ files:
|
|
54
54
|
- spec/fixtures/over_100k_file.txt
|
55
55
|
- spec/fixtures/text_file.txt
|
56
56
|
- spec/fixtures/text_file_without_extension
|
57
|
+
- spec/fixtures/unknown.bin
|
57
58
|
- spec/rika_spec.rb
|
58
59
|
- spec/spec_helper.rb
|
59
60
|
- target/dependency/apache-mime4j-core-0.7.2.jar
|
@@ -124,5 +125,6 @@ test_files:
|
|
124
125
|
- spec/fixtures/over_100k_file.txt
|
125
126
|
- spec/fixtures/text_file.txt
|
126
127
|
- spec/fixtures/text_file_without_extension
|
128
|
+
- spec/fixtures/unknown.bin
|
127
129
|
- spec/rika_spec.rb
|
128
130
|
- spec/spec_helper.rb
|