rtika 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +5 -0
- data/VERSION +1 -1
- data/lib/rtika.rb +42 -21
- data/lib/{tika-app-0.7.jar → tika-app-0.9-SNAPSHOT.jar} +0 -0
- data/lib/tika-bundle-0.9-SNAPSHOT.jar +0 -0
- data/lib/{tika-core-0.7.jar → tika-core-0.9-SNAPSHOT.jar} +0 -0
- data/lib/tika-parsers-0.9-SNAPSHOT.jar +0 -0
- data/rtika.gemspec +6 -6
- metadata +7 -7
- data/lib/tika-bundle-0.7.jar +0 -0
- data/lib/tika-parsers-0.7.jar +0 -0
data/README.rdoc
CHANGED
@@ -20,6 +20,11 @@ Make sure you're on JRuby first.
|
|
20
20
|
puts result.content # returns <body> contents
|
21
21
|
puts result.title # returns <title> contents
|
22
22
|
|
23
|
+
Options
|
24
|
+
|
25
|
+
:remove_boilerplate => true
|
26
|
+
# uses the Boilerpipe library that ships with Tika to remove headers & footers
|
27
|
+
|
23
28
|
== Note on Patches/Pull Requests
|
24
29
|
|
25
30
|
* Fork the project.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.3.0
|
data/lib/rtika.rb
CHANGED
@@ -7,7 +7,9 @@ Dir[File.join(File.dirname(__FILE__), "*.jar")].each do |jar|
|
|
7
7
|
end
|
8
8
|
|
9
9
|
module RTika
|
10
|
+
import org.apache.tika.parser.html.BoilerpipeContentHandler
|
10
11
|
import org.apache.tika.sax.BodyContentHandler
|
12
|
+
import org.apache.tika.sax.WriteOutContentHandler
|
11
13
|
import org.apache.tika.parser.AutoDetectParser
|
12
14
|
import org.apache.tika.metadata.Metadata
|
13
15
|
|
@@ -43,10 +45,32 @@ module RTika
|
|
43
45
|
new(*args).parse
|
44
46
|
end
|
45
47
|
|
48
|
+
def remove_boilerplate?
|
49
|
+
@options[:remove_boilerplate] && @options[:remove_boilerplate] == true
|
50
|
+
end
|
51
|
+
|
52
|
+
def initialize(*args)
|
53
|
+
@options = args.last
|
54
|
+
|
55
|
+
if remove_boilerplate?
|
56
|
+
@writeout_content = RTika::WriteOutContentHandler.new(-1)
|
57
|
+
@content = RTika::BoilerpipeContentHandler.new(@writeout_content)
|
58
|
+
else
|
59
|
+
@content = RTika::BodyContentHandler.new(-1)
|
60
|
+
end
|
61
|
+
|
62
|
+
@metadata = RTika::Metadata.new
|
63
|
+
end
|
64
|
+
|
46
65
|
def parse
|
47
66
|
@parser = RTika::AutoDetectParser.new
|
48
|
-
content, metadata = process
|
49
|
-
|
67
|
+
@content, @metadata = process
|
68
|
+
|
69
|
+
if remove_boilerplate?
|
70
|
+
RTika::ParsedResult.new(@writeout_content, @metadata)
|
71
|
+
else
|
72
|
+
RTika::ParsedResult.new(@content, @metadata)
|
73
|
+
end
|
50
74
|
end
|
51
75
|
|
52
76
|
def process
|
@@ -55,56 +79,53 @@ module RTika
|
|
55
79
|
end
|
56
80
|
|
57
81
|
class StringParser < GenericParser
|
58
|
-
def initialize(string)
|
82
|
+
def initialize(string, opts={})
|
83
|
+
super(opts)
|
59
84
|
@input_string = string
|
60
85
|
end
|
61
86
|
|
62
87
|
def process
|
63
88
|
input_stream = java.io.ByteArrayInputStream.new(@input_string.to_java.get_bytes)
|
64
|
-
content = RTika::BodyContentHandler.new(-1)
|
65
|
-
metadata = RTika::Metadata.new
|
66
89
|
|
67
|
-
@parser.parse(input_stream, content, metadata)
|
90
|
+
@parser.parse(input_stream, @content, @metadata)
|
68
91
|
input_stream.close
|
69
92
|
|
70
|
-
return [content, metadata]
|
93
|
+
return [@content, @metadata]
|
71
94
|
end
|
72
95
|
end
|
73
96
|
|
74
97
|
class FileParser < GenericParser
|
75
|
-
def initialize(filename)
|
98
|
+
def initialize(filename, opts={})
|
99
|
+
super(opts)
|
76
100
|
@filename = filename
|
77
101
|
end
|
78
102
|
|
79
103
|
def process
|
80
104
|
input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
|
81
|
-
|
82
|
-
metadata = RTika::Metadata.new
|
83
|
-
metadata.set("filename", File.basename(@filename))
|
105
|
+
@metadata.set("filename", File.basename(@filename))
|
84
106
|
|
85
|
-
@parser.parse(input_stream, content, metadata)
|
107
|
+
@parser.parse(input_stream, @content, @metadata)
|
86
108
|
input_stream.close
|
87
109
|
|
88
|
-
return [content, metadata]
|
110
|
+
return [@content, @metadata]
|
89
111
|
end
|
90
112
|
end
|
91
113
|
|
92
114
|
class UrlParser < GenericParser
|
93
|
-
def initialize(url, content)
|
115
|
+
def initialize(url, content, opts={})
|
116
|
+
super(opts)
|
94
117
|
@url = url
|
95
|
-
@
|
118
|
+
@url_content = content
|
96
119
|
end
|
97
120
|
|
98
121
|
def process
|
99
|
-
input_stream = java.io.ByteArrayInputStream.new(@
|
100
|
-
|
101
|
-
metadata = RTika::Metadata.new
|
102
|
-
metadata.set("filename", File.basename(@url))
|
122
|
+
input_stream = java.io.ByteArrayInputStream.new(@url_content.to_java.get_bytes)
|
123
|
+
@metadata.set("filename", File.basename(@url))
|
103
124
|
|
104
|
-
@parser.parse(input_stream, content, metadata)
|
125
|
+
@parser.parse(input_stream, @content, @metadata)
|
105
126
|
input_stream.close
|
106
127
|
|
107
|
-
return [content, metadata]
|
128
|
+
return [@content, @metadata]
|
108
129
|
end
|
109
130
|
end
|
110
131
|
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/rtika.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{rtika}
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Pradeep Elankumaran"]
|
12
|
-
s.date = %q{2010-11-
|
12
|
+
s.date = %q{2010-11-09}
|
13
13
|
s.description = %q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
|
14
14
|
s.email = %q{pradeepe@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,10 +24,10 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
26
|
"lib/rtika.rb",
|
27
|
-
"lib/tika-app-0.7.jar",
|
28
|
-
"lib/tika-bundle-0.7.jar",
|
29
|
-
"lib/tika-core-0.7.jar",
|
30
|
-
"lib/tika-parsers-0.7.jar",
|
27
|
+
"lib/tika-app-0.9-SNAPSHOT.jar",
|
28
|
+
"lib/tika-bundle-0.9-SNAPSHOT.jar",
|
29
|
+
"lib/tika-core-0.9-SNAPSHOT.jar",
|
30
|
+
"lib/tika-parsers-0.9-SNAPSHOT.jar",
|
31
31
|
"rtika.gemspec",
|
32
32
|
"test/helper.rb",
|
33
33
|
"test/test_rtika.rb"
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 2
|
7
|
+
- 3
|
8
8
|
- 0
|
9
|
-
version: 0.2.0
|
9
|
+
version: 0.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Pradeep Elankumaran
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-11-
|
17
|
+
date: 2010-11-09 00:00:00 -08:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -35,10 +35,10 @@ files:
|
|
35
35
|
- Rakefile
|
36
36
|
- VERSION
|
37
37
|
- lib/rtika.rb
|
38
|
-
- lib/tika-app-0.7.jar
|
39
|
-
- lib/tika-bundle-0.7.jar
|
40
|
-
- lib/tika-core-0.7.jar
|
41
|
-
- lib/tika-parsers-0.7.jar
|
38
|
+
- lib/tika-app-0.9-SNAPSHOT.jar
|
39
|
+
- lib/tika-bundle-0.9-SNAPSHOT.jar
|
40
|
+
- lib/tika-core-0.9-SNAPSHOT.jar
|
41
|
+
- lib/tika-parsers-0.9-SNAPSHOT.jar
|
42
42
|
- rtika.gemspec
|
43
43
|
- test/helper.rb
|
44
44
|
- test/test_rtika.rb
|
data/lib/tika-bundle-0.7.jar
DELETED
Binary file
|
data/lib/tika-parsers-0.7.jar
DELETED
Binary file
|