rika 0.9.3-java → 0.9.4-java

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -31,6 +31,10 @@ Something like this:
31
31
  # Return the content of the document:
32
32
  parser.content
33
33
 
34
+ # Return the media type for the document:
35
+ parser.media_type
36
+ => "application/pdf"
37
+
34
38
  # Return the metadata field title if it exists:
35
39
  parser.metadata["title"] if parser.metadata_exists?("title")
36
40
 
@@ -42,7 +46,7 @@ Something like this:
42
46
  parser.content # 10000 first chars returned
43
47
 
44
48
  # Return content from URL
45
- parser = Rika::Parser.new('http://www.exampleurl.com/example.pdf')
49
+ parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
46
50
  parser.content
47
51
  ```
48
52
  ## Contributing
@@ -46,6 +46,10 @@ module Rika
46
46
  metadata_hash
47
47
  end
48
48
 
49
+ def media_type
50
+ @media_type
51
+ end
52
+
49
53
  def available_metadata
50
54
  @metadata.names.to_a
51
55
  end
@@ -59,6 +63,7 @@ module Rika
59
63
  def parse_file
60
64
  input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
61
65
  @metadata.set("filename", File.basename(@uri))
66
+ @media_type = @tika.detect(java.io.FileInputStream.new(java.io.File.new(@uri)))
62
67
  @content = @tika.parse_to_string(input_stream, @metadata)
63
68
  end
64
69
 
@@ -66,6 +71,7 @@ module Rika
66
71
  raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
67
72
  url = java.net.URL.new(@uri)
68
73
  input_stream = url.open_stream
74
+ @media_type = @tika.detect(url.open_stream)
69
75
  @metadata.set("url", @uri)
70
76
  @content = @tika.parse_to_string(input_stream, @metadata)
71
77
  end
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "0.9.3"
2
+ VERSION = "0.9.4"
3
3
  end
Binary file
@@ -11,10 +11,11 @@ describe Rika::Parser do
11
11
  @docx_parser = Rika::Parser.new(file_path("document.docx"))
12
12
  @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
13
13
  @image_parser = Rika::Parser.new(file_path("image.jpg"))
14
+ @unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
14
15
  @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
15
16
  port = 50505
16
17
  @url = "http://#{Socket.gethostname}:#{port}"
17
-
18
+ @qoute = "First they ignore you, then they ridicule you, then they fight you, then you win."
18
19
  @t1 = Thread.new do
19
20
  @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
20
21
  :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
@@ -41,15 +42,15 @@ describe Rika::Parser do
41
42
 
42
43
  describe '#content' do
43
44
  it "should return the content in a text file" do
44
- @txt_parser.content.strip.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
45
+ @txt_parser.content.strip.should == @qoute
45
46
  end
46
47
 
47
48
  it "should return the content in a docx file" do
48
- @docx_parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
49
+ @docx_parser.content.should == @qoute
49
50
  end
50
51
 
51
52
  it "should return the content in a pdf file" do
52
- @pdf_parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
53
+ @pdf_parser.content.should == @qoute
53
54
  end
54
55
 
55
56
  it "should return no content for an image" do
@@ -73,7 +74,11 @@ describe Rika::Parser do
73
74
 
74
75
  it "should return the content from a file over http" do
75
76
  parser = Rika::Parser.new(@url + "/document.pdf")
76
- parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
77
+ parser.content.should == @qoute
78
+ end
79
+
80
+ it "should return empty string for unknown file" do
81
+ @unknown_parser.content.should be_empty
77
82
  end
78
83
  end
79
84
 
@@ -127,4 +132,23 @@ describe Rika::Parser do
127
132
  @docx_parser.metadata_exists?("title").should == true
128
133
  end
129
134
  end
135
+
136
+ describe '#media_type' do
137
+ it "should return application/pdf for a pdf file" do
138
+ @pdf_parser.media_type.should == "application/pdf"
139
+ end
140
+
141
+ it "should return text/plain for a txt file" do
142
+ @txt_parser.media_type.should == "text/plain"
143
+ end
144
+
145
+ it "should return application/pdf for a pdf over http" do
146
+ parser = Rika::Parser.new(@url + "/document.pdf")
147
+ parser.media_type.should == "application/pdf"
148
+ end
149
+
150
+ it "should return application/octet-stream for unknown file" do
151
+ @unknown_parser.media_type.should == "application/octet-stream"
152
+ end
153
+ end
130
154
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.3
4
+ version: 0.9.4
5
5
  prerelease:
6
6
  platform: java
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-23 00:00:00.000000000 Z
12
+ date: 2012-09-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -54,6 +54,7 @@ files:
54
54
  - spec/fixtures/over_100k_file.txt
55
55
  - spec/fixtures/text_file.txt
56
56
  - spec/fixtures/text_file_without_extension
57
+ - spec/fixtures/unknown.bin
57
58
  - spec/rika_spec.rb
58
59
  - spec/spec_helper.rb
59
60
  - target/dependency/apache-mime4j-core-0.7.2.jar
@@ -124,5 +125,6 @@ test_files:
124
125
  - spec/fixtures/over_100k_file.txt
125
126
  - spec/fixtures/text_file.txt
126
127
  - spec/fixtures/text_file_without_extension
128
+ - spec/fixtures/unknown.bin
127
129
  - spec/rika_spec.rb
128
130
  - spec/spec_helper.rb