rtika 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,6 +20,11 @@ Make sure you're on JRuby first.
20
20
  puts result.content # returns <body> contents
21
21
  puts result.title # returns <title> contents
22
22
 
23
+ Options
24
+
25
+ :remove_boilerplate => true
26
+ # uses the Boilerpipe library that ships with Tika to remove headers & footers
27
+
23
28
  == Note on Patches/Pull Requests
24
29
 
25
30
  * Fork the project.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.3.0
@@ -7,7 +7,9 @@ Dir[File.join(File.dirname(__FILE__), "*.jar")].each do |jar|
7
7
  end
8
8
 
9
9
  module RTika
10
+ import org.apache.tika.parser.html.BoilerpipeContentHandler
10
11
  import org.apache.tika.sax.BodyContentHandler
12
+ import org.apache.tika.sax.WriteOutContentHandler
11
13
  import org.apache.tika.parser.AutoDetectParser
12
14
  import org.apache.tika.metadata.Metadata
13
15
 
@@ -43,10 +45,32 @@ module RTika
43
45
  new(*args).parse
44
46
  end
45
47
 
48
+ def remove_boilerplate?
49
+ @options[:remove_boilerplate] && @options[:remove_boilerplate] == true
50
+ end
51
+
52
+ def initialize(*args)
53
+ @options = args.last
54
+
55
+ if remove_boilerplate?
56
+ @writeout_content = RTika::WriteOutContentHandler.new(-1)
57
+ @content = RTika::BoilerpipeContentHandler.new(@writeout_content)
58
+ else
59
+ @content = RTika::BodyContentHandler.new(-1)
60
+ end
61
+
62
+ @metadata = RTika::Metadata.new
63
+ end
64
+
46
65
  def parse
47
66
  @parser = RTika::AutoDetectParser.new
48
- content, metadata = process
49
- RTika::ParsedResult.new(content, metadata)
67
+ @content, @metadata = process
68
+
69
+ if remove_boilerplate?
70
+ RTika::ParsedResult.new(@writeout_content, @metadata)
71
+ else
72
+ RTika::ParsedResult.new(@content, @metadata)
73
+ end
50
74
  end
51
75
 
52
76
  def process
@@ -55,56 +79,53 @@ module RTika
55
79
  end
56
80
 
57
81
  class StringParser < GenericParser
58
- def initialize(string)
82
+ def initialize(string, opts={})
83
+ super(opts)
59
84
  @input_string = string
60
85
  end
61
86
 
62
87
  def process
63
88
  input_stream = java.io.ByteArrayInputStream.new(@input_string.to_java.get_bytes)
64
- content = RTika::BodyContentHandler.new(-1)
65
- metadata = RTika::Metadata.new
66
89
 
67
- @parser.parse(input_stream, content, metadata)
90
+ @parser.parse(input_stream, @content, @metadata)
68
91
  input_stream.close
69
92
 
70
- return [content, metadata]
93
+ return [@content, @metadata]
71
94
  end
72
95
  end
73
96
 
74
97
  class FileParser < GenericParser
75
- def initialize(filename)
98
+ def initialize(filename, opts={})
99
+ super(opts)
76
100
  @filename = filename
77
101
  end
78
102
 
79
103
  def process
80
104
  input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
81
- content = RTika::BodyContentHandler.new(-1)
82
- metadata = RTika::Metadata.new
83
- metadata.set("filename", File.basename(@filename))
105
+ @metadata.set("filename", File.basename(@filename))
84
106
 
85
- @parser.parse(input_stream, content, metadata)
107
+ @parser.parse(input_stream, @content, @metadata)
86
108
  input_stream.close
87
109
 
88
- return [content, metadata]
110
+ return [@content, @metadata]
89
111
  end
90
112
  end
91
113
 
92
114
  class UrlParser < GenericParser
93
- def initialize(url, content)
115
+ def initialize(url, content, opts={})
116
+ super(opts)
94
117
  @url = url
95
- @content = content
118
+ @url_content = content
96
119
  end
97
120
 
98
121
  def process
99
- input_stream = java.io.ByteArrayInputStream.new(@content.to_java.get_bytes)
100
- content = RTika::BodyContentHandler.new(-1)
101
- metadata = RTika::Metadata.new
102
- metadata.set("filename", File.basename(@url))
122
+ input_stream = java.io.ByteArrayInputStream.new(@url_content.to_java.get_bytes)
123
+ @metadata.set("filename", File.basename(@url))
103
124
 
104
- @parser.parse(input_stream, content, metadata)
125
+ @parser.parse(input_stream, @content, @metadata)
105
126
  input_stream.close
106
127
 
107
- return [content, metadata]
128
+ return [@content, @metadata]
108
129
  end
109
130
  end
110
131
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{rtika}
8
- s.version = "0.2.0"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Pradeep Elankumaran"]
12
- s.date = %q{2010-11-03}
12
+ s.date = %q{2010-11-09}
13
13
  s.description = %q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
14
14
  s.email = %q{pradeepe@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -24,10 +24,10 @@ Gem::Specification.new do |s|
24
24
  "Rakefile",
25
25
  "VERSION",
26
26
  "lib/rtika.rb",
27
- "lib/tika-app-0.7.jar",
28
- "lib/tika-bundle-0.7.jar",
29
- "lib/tika-core-0.7.jar",
30
- "lib/tika-parsers-0.7.jar",
27
+ "lib/tika-app-0.9-SNAPSHOT.jar",
28
+ "lib/tika-bundle-0.9-SNAPSHOT.jar",
29
+ "lib/tika-core-0.9-SNAPSHOT.jar",
30
+ "lib/tika-parsers-0.9-SNAPSHOT.jar",
31
31
  "rtika.gemspec",
32
32
  "test/helper.rb",
33
33
  "test/test_rtika.rb"
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 2
7
+ - 3
8
8
  - 0
9
- version: 0.2.0
9
+ version: 0.3.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Pradeep Elankumaran
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-03 00:00:00 -07:00
17
+ date: 2010-11-09 00:00:00 -08:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
@@ -35,10 +35,10 @@ files:
35
35
  - Rakefile
36
36
  - VERSION
37
37
  - lib/rtika.rb
38
- - lib/tika-app-0.7.jar
39
- - lib/tika-bundle-0.7.jar
40
- - lib/tika-core-0.7.jar
41
- - lib/tika-parsers-0.7.jar
38
+ - lib/tika-app-0.9-SNAPSHOT.jar
39
+ - lib/tika-bundle-0.9-SNAPSHOT.jar
40
+ - lib/tika-core-0.9-SNAPSHOT.jar
41
+ - lib/tika-parsers-0.9-SNAPSHOT.jar
42
42
  - rtika.gemspec
43
43
  - test/helper.rb
44
44
  - test/test_rtika.rb
Binary file
Binary file