rika 0.9.2-java → 0.9.3-java

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -39,7 +39,11 @@ Something like this:
39
39
 
40
40
  # Return only the first 10000 chars of the content:
41
41
  parser = Rika::Parser.new('document.pdf', 10000)
42
- parser.content # 10000 first chars returned
42
+ parser.content # 10000 first chars returned
43
+
44
+ # Return content from URL
45
+ parser = Rika::Parser.new('http://www.exampleurl.com/example.pdf')
46
+ parser.content
43
47
  ```
44
48
  ## Contributing
45
49
 
data/lib/rika.rb CHANGED
@@ -1,24 +1,34 @@
1
1
  raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
2
2
 
3
3
  require "rika/version"
4
+ require 'uri'
5
+ require 'net/http'
4
6
  require 'java'
5
7
 
6
8
  Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
7
9
  require jar
8
10
  end
9
11
 
12
+ # Heavily based on the Apache Tika API: http://tika.apache.org/1.2/api/org/apache/tika/Tika.html
10
13
  module Rika
11
14
  import org.apache.tika.metadata.Metadata
12
15
  import org.apache.tika.Tika
16
+
13
17
  class Parser
14
18
 
15
- def initialize(filename, max_content_length = -1)
16
- if File.exists?(filename)
17
- @filename = filename
18
- @max_content_length = max_content_length
19
- self.perform
19
+ def initialize(uri, max_content_length = -1)
20
+ p = URI::Parser.new
21
+ @uri = uri
22
+ @tika = Tika.new
23
+ @tika.set_max_string_length(max_content_length)
24
+ @metadata = Metadata.new
25
+
26
+ if File.exists?(@uri)
27
+ self.parse_file
28
+ elsif p.parse(@uri).scheme == 'http' || p.parse(@uri).scheme == 'https'
29
+ self.parse_url
20
30
  else
21
- raise IOError, "File does not exist"
31
+ raise IOError, "File does not exist or can't be reached."
22
32
  end
23
33
  end
24
34
 
@@ -41,26 +51,23 @@ module Rika
41
51
  end
42
52
 
43
53
  def metadata_exists?(name)
44
- if @metadata.get(name) == nil
45
- false
46
- else
47
- true
48
- end
54
+ @metadata.get(name) != nil
49
55
  end
50
56
 
51
57
  protected
52
58
 
53
- def perform
54
- input_stream = nil
55
- begin
56
- input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
57
- @metadata = Metadata.new
58
- @metadata.set("filename", File.basename(@filename))
59
- @tika = Tika.new
60
- @content = @tika.parse_to_string(input_stream, @metadata, @max_content_length)
61
- ensure
62
- input_stream.close
63
- end
59
+ def parse_file
60
+ input_stream = java.io.FileInputStream.new(java.io.File.new(@uri))
61
+ @metadata.set("filename", File.basename(@uri))
62
+ @content = @tika.parse_to_string(input_stream, @metadata)
63
+ end
64
+
65
+ def parse_url
66
+ raise IOError, "File does not exist or can't be reached." if not Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
67
+ url = java.net.URL.new(@uri)
68
+ input_stream = url.open_stream
69
+ @metadata.set("url", @uri)
70
+ @content = @tika.parse_to_string(input_stream, @metadata)
64
71
  end
65
72
  end
66
73
  end
data/lib/rika/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "0.9.2"
2
+ VERSION = "0.9.3"
3
3
  end
data/spec/rika_spec.rb CHANGED
@@ -1,17 +1,37 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'spec_helper'
4
+ require 'webrick'
4
5
 
6
+ include WEBrick
7
+
5
8
  describe Rika::Parser do
6
9
  before(:all) do
7
10
  @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
8
11
  @docx_parser = Rika::Parser.new(file_path("document.docx"))
9
12
  @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
10
13
  @image_parser = Rika::Parser.new(file_path("image.jpg"))
14
+ @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
15
+ port = 50505
16
+ @url = "http://#{Socket.gethostname}:#{port}"
17
+
18
+ @t1 = Thread.new do
19
+ @server = HTTPServer.new(:Port => port, :DocumentRoot => @dir,
20
+ :AccessLog => [], :Logger => WEBrick::Log::new("/dev/null", 7))
21
+ @server.start
22
+ end
23
+ end
24
+
25
+ after(:all) do
26
+ @t1.exit
11
27
  end
12
28
 
13
29
  it "should raise error if file does not exists" do
14
- lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist")
30
+ lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist or can't be reached.")
31
+ end
32
+
33
+ it "should raise error if URL does not exists" do
34
+ lambda { Rika::Parser.new("http://nonsense.com/whatever.pdf") }.should raise_error(IOError, "File does not exist or can't be reached.")
15
35
  end
16
36
 
17
37
  it "should detect file type without a file extension" do
@@ -41,10 +61,20 @@ describe Rika::Parser do
41
61
  parser.content.should == "First"
42
62
  end
43
63
 
64
+ it "should only return max content length for file over http" do
65
+ parser = Rika::Parser.new(@url + "/document.pdf", 6)
66
+ parser.content.should == "First"
67
+ end
68
+
44
69
  it "should be possible to read files over 100k by default" do
45
70
  parser = Rika::Parser.new(file_path("over_100k_file.txt"))
46
71
  parser.content.length.should == 101_761
47
72
  end
73
+
74
+ it "should return the content from a file over http" do
75
+ parser = Rika::Parser.new(@url + "/document.pdf")
76
+ parser.content.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
77
+ end
48
78
  end
49
79
 
50
80
  # We just test a few of the metadata fields for some common file formats
@@ -67,6 +97,11 @@ describe Rika::Parser do
67
97
  @pdf_parser.metadata["title"].should == "A simple title"
68
98
  end
69
99
 
100
+ it "should return metadata from a file over http" do
101
+ parser = Rika::Parser.new(@url + "/document.pdf")
102
+ parser.metadata["title"].should == "A simple title"
103
+ end
104
+
70
105
  it "should return metadata from an image" do
71
106
  @image_parser.metadata["Image Height"].should == "72 pixels"
72
107
  @image_parser.metadata["Image Width"].should == "72 pixels"
data/spec/spec_helper.rb CHANGED
@@ -3,6 +3,7 @@ require "rika"
3
3
  def file_path( *paths )
4
4
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
5
5
  end
6
+
6
7
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
8
  RSpec.configure do |config|
8
9
  config.treat_symbols_as_metadata_keys_with_true_values = true
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
+ version: 0.9.3
4
5
  prerelease:
5
- version: 0.9.2
6
6
  platform: java
7
7
  authors:
8
8
  - Richard Nyström
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-17 00:00:00.000000000 Z
12
+ date: 2012-09-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec