rika 0.9.7-java → 0.9.8-java

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rika.rb CHANGED
@@ -11,7 +11,7 @@ Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |ja
11
11
  require jar
12
12
  end
13
13
 
14
- # Heavily based on the Apache Tika API: http://tika.apache.org/1.2/api/org/apache/tika/Tika.html
14
+ # Heavily based on the Apache Tika API: http://tika.apache.org/1.3/api/org/apache/tika/Tika.html
15
15
  module Rika
16
16
  import org.apache.tika.metadata.Metadata
17
17
  import org.apache.tika.Tika
@@ -24,14 +24,9 @@ module Rika
24
24
  @uri = file_location
25
25
  @tika = Tika.new
26
26
  @tika.set_max_string_length(max_content_length)
27
- @metadata = Metadata.new
28
-
29
- @is_file = File.exists?(@uri) && File.directory?(@uri) == false
30
- is_http = URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess) if !@is_file
31
-
32
- if !@is_file && !is_http
33
- raise IOError, "File does not exist or can't be reached."
34
- end
27
+ @metadata_java = Metadata.new
28
+ @metadata_ruby = nil
29
+ @input_type = get_input_type
35
30
  end
36
31
 
37
32
  def content
@@ -40,14 +35,15 @@ module Rika
40
35
  end
41
36
 
42
37
  def metadata
43
- self.parse
44
- metadata_hash = {}
38
+ unless @metadata_ruby
39
+ self.parse
40
+ @metadata_ruby = {}
45
41
 
46
- @metadata.names.each do |name|
47
- metadata_hash[name] = @metadata.get(name)
42
+ @metadata_java.names.each do |name|
43
+ @metadata_ruby[name] = @metadata_java.get(name)
44
+ end
48
45
  end
49
-
50
- metadata_hash
46
+ @metadata_ruby
51
47
  end
52
48
 
53
49
  def media_type
@@ -55,25 +51,37 @@ module Rika
55
51
  end
56
52
 
57
53
  def available_metadata
58
- self.parse
59
- @metadata.names.to_a
54
+ metadata.keys
60
55
  end
61
56
 
62
57
  def metadata_exists?(name)
63
- self.parse
64
- @metadata.get(name) != nil
58
+ metadata[name] != nil
59
+ end
60
+
61
+ def file?
62
+ @input_type == :file
65
63
  end
66
64
 
67
65
  protected
68
66
 
69
67
  def parse
70
- @content ||= @tika.parse_to_string(input_stream, @metadata).to_s.strip
68
+ @content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
69
+ end
70
+
71
+ def get_input_type
72
+ if File.exists?(@uri) && File.directory?(@uri) == false
73
+ :file
74
+ elsif URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
75
+ :http
76
+ else
77
+ raise IOError, "Input (#{@uri}) is neither file nor http."
78
+ end
71
79
  end
72
80
 
73
81
  def input_stream
74
- if @is_file
82
+ if file?
75
83
  FileInputStream.new(java.io.File.new(@uri))
76
- else
84
+ else # :http
77
85
  URL.new(@uri).open_stream
78
86
  end
79
87
  end
data/lib/rika/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "0.9.7"
2
+ VERSION = "0.9.8"
3
3
  end
data/spec/rika_spec.rb CHANGED
@@ -15,7 +15,7 @@ describe Rika::Parser do
15
15
  @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
16
16
  port = 50505
17
17
  @url = "http://#{Socket.gethostname}:#{port}"
18
- @qoute = "First they ignore you, then they ridicule you, then they fight you, then you win."
18
+ @quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
19
19
  @t1 = Thread.new do
20
20
  @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
21
21
  :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
@@ -28,11 +28,11 @@ describe Rika::Parser do
28
28
  end
29
29
 
30
30
  it "should raise error if file does not exists" do
31
- lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist or can't be reached.")
31
+ lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError)
32
32
  end
33
33
 
34
34
  it "should raise error if URL does not exists" do
35
- lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError, "File does not exist or can't be reached.")
35
+ lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError)
36
36
  end
37
37
 
38
38
  it "should detect file type without a file extension" do
@@ -41,20 +41,20 @@ describe Rika::Parser do
41
41
  end
42
42
 
43
43
  it "should not be possible to trick the parser to read a folder with an extension" do
44
- lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError, "File does not exist or can't be reached.")
44
+ lambda { Rika::Parser.new(file_path("folder.js")).content }.should raise_error(IOError)
45
45
  end
46
46
 
47
47
  describe '#content' do
48
48
  it "should return the content in a text file" do
49
- @txt_parser.content.strip.should == @qoute
49
+ @txt_parser.content.strip.should == @quote
50
50
  end
51
51
 
52
52
  it "should return the content in a docx file" do
53
- @docx_parser.content.should == @qoute
53
+ @docx_parser.content.should == @quote
54
54
  end
55
55
 
56
56
  it "should return the content in a pdf file" do
57
- @pdf_parser.content.should == @qoute
57
+ @pdf_parser.content.should == @quote
58
58
  end
59
59
 
60
60
  it "should return no content for an image" do
@@ -78,7 +78,7 @@ describe Rika::Parser do
78
78
 
79
79
  it "should return the content from a file over http" do
80
80
  parser = Rika::Parser.new(@url + "/document.pdf")
81
- parser.content.should == @qoute
81
+ parser.content.should == @quote
82
82
  end
83
83
 
84
84
  it "should return empty string for unknown file" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.7
4
+ version: 0.9.8
5
5
  prerelease:
6
6
  platform: java
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-22 00:00:00.000000000 Z
12
+ date: 2013-01-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec