jruby-boilerpipe 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/boilerpipe/extractors/article_extractor.rb +23 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +17 -0
- data/lib/boilerpipe/version.rb +1 -1
- data/lib/boilerpipe.rb +3 -25
- metadata +15 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 8a36efcb938933662d44ad33cd1887e99c34edf1376d36fc383a5a0424b4b797
|
4
|
+
data.tar.gz: ccdab7fda06435dfc91effb80f1edae0d810c6ef44b728a3e60710275a08d3da
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cda5bbaf4fb99bfa863bcc718d66145481c23c7ded8ccfc57b6a8e6a7280d9e89f67c00c388719f3652727c027158dc19f92f84646bfdc89320f1f05f3e5f74c
|
7
|
+
data.tar.gz: e5bfef4c89ebbe6ba8658fcb653d6605f9a6253eaaa14ff9f13b23840a273c6ba4f495c6099d33852d4c68d655c9da2324606ed4c349412abafda3292f4e8608
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Boilerpipe
|
2
|
+
java_import 'com.kohlschutter.boilerpipe.extractors.ArticleExtractor'
|
3
|
+
java_import 'com.kohlschutter.boilerpipe.util.UnicodeTokenizer'
|
4
|
+
java_import java.net.URL
|
5
|
+
|
6
|
+
class ArticleExtractor
|
7
|
+
def self.get_text(s)
|
8
|
+
url = nil
|
9
|
+
|
10
|
+
begin
|
11
|
+
url = Java::JavaNet::URL.new(s)
|
12
|
+
rescue Java::JavaNet::MalformedURLException => e
|
13
|
+
# not a URL
|
14
|
+
end
|
15
|
+
input = url ? url : s
|
16
|
+
ArticleExtractor::INSTANCE.get_text(input)
|
17
|
+
end
|
18
|
+
|
19
|
+
class <<self
|
20
|
+
alias_method :text, :get_text
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Boilerpipe
|
2
|
+
module SAX
|
3
|
+
java_import 'com.kohlschutter.boilerpipe.sax.BoilerpipeHTMLParser'
|
4
|
+
java_import 'org.xml.sax.InputSource'
|
5
|
+
java_import java.io.StringReader
|
6
|
+
|
7
|
+
class BoilerpipeHTMLParser
|
8
|
+
def self.parse(text)
|
9
|
+
parser = BoilerpipeHTMLParser.new
|
10
|
+
string_reader = StringReader.new(text)
|
11
|
+
is = InputSource.new(string_reader)
|
12
|
+
parser.parse(is)
|
13
|
+
parser.to_text_document
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/boilerpipe/version.rb
CHANGED
data/lib/boilerpipe.rb
CHANGED
@@ -1,26 +1,4 @@
|
|
1
|
-
require 'boilerpipe/version'
|
2
1
|
require_relative 'boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
java_import 'com.kohlschutter.boilerpipe.util.UnicodeTokenizer'
|
7
|
-
java_import java.net.URL
|
8
|
-
|
9
|
-
class ArticleExtractor
|
10
|
-
def self.get_text(s)
|
11
|
-
url = nil
|
12
|
-
|
13
|
-
begin
|
14
|
-
url = Java::JavaNet::URL.new(s)
|
15
|
-
rescue Java::JavaNet::MalformedURLException => e
|
16
|
-
# not a URL
|
17
|
-
end
|
18
|
-
input = url ? url : s
|
19
|
-
ArticleExtractor::INSTANCE.get_text(input)
|
20
|
-
end
|
21
|
-
|
22
|
-
class <<self
|
23
|
-
alias_method :text, :get_text
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
2
|
+
require 'boilerpipe/version'
|
3
|
+
require 'boilerpipe/sax/boilerpipe_html_parser'
|
4
|
+
require 'boilerpipe/extractors/article_extractor'
|
metadata
CHANGED
@@ -1,52 +1,52 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jruby-boilerpipe
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
14
|
requirement: !ruby/object:Gem::Requirement
|
16
15
|
requirements:
|
17
16
|
- - "~>"
|
18
17
|
- !ruby/object:Gem::Version
|
19
18
|
version: '1.10'
|
20
|
-
|
19
|
+
name: bundler
|
21
20
|
prerelease: false
|
21
|
+
type: :development
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.10'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name: rake
|
29
28
|
requirement: !ruby/object:Gem::Requirement
|
30
29
|
requirements:
|
31
30
|
- - "~>"
|
32
31
|
- !ruby/object:Gem::Version
|
33
32
|
version: '10.0'
|
34
|
-
|
33
|
+
name: rake
|
35
34
|
prerelease: false
|
35
|
+
type: :development
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name: rspec
|
43
42
|
requirement: !ruby/object:Gem::Requirement
|
44
43
|
requirements:
|
45
44
|
- - ">="
|
46
45
|
- !ruby/object:Gem::Version
|
47
46
|
version: '0'
|
48
|
-
|
47
|
+
name: rspec
|
49
48
|
prerelease: false
|
49
|
+
type: :development
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
@@ -69,11 +69,13 @@ files:
|
|
69
69
|
- jruby-boilerpipe.gemspec
|
70
70
|
- lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar
|
71
71
|
- lib/boilerpipe.rb
|
72
|
+
- lib/boilerpipe/extractors/article_extractor.rb
|
73
|
+
- lib/boilerpipe/sax/boilerpipe_html_parser.rb
|
72
74
|
- lib/boilerpipe/version.rb
|
73
75
|
homepage: https://github.com/gregors/jruby-boilerpipe
|
74
76
|
licenses: []
|
75
77
|
metadata: {}
|
76
|
-
post_install_message:
|
78
|
+
post_install_message:
|
77
79
|
rdoc_options: []
|
78
80
|
require_paths:
|
79
81
|
- lib
|
@@ -88,9 +90,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
90
|
- !ruby/object:Gem::Version
|
89
91
|
version: '0'
|
90
92
|
requirements: []
|
91
|
-
rubyforge_project:
|
92
|
-
rubygems_version: 2.
|
93
|
-
signing_key:
|
93
|
+
rubyforge_project:
|
94
|
+
rubygems_version: 2.6.11
|
95
|
+
signing_key:
|
94
96
|
specification_version: 4
|
95
97
|
summary: Ruby wrapper around boilerpipe java library
|
96
98
|
test_files: []
|