jakal 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +5 -8
- data/README.md +2 -54
- data/bin/convert_to_should_syntax +14 -0
- data/bin/edit_json.rb +14 -0
- data/bin/minitar +14 -0
- data/bin/nokogiri +14 -0
- data/bin/prettify_json.rb +14 -0
- data/bin/rake +14 -0
- data/bin/rdebug +14 -0
- data/bin/term-extract +14 -0
- data/bin/testrb +14 -0
- data/jkl.gemspec +2 -2
- data/lib/jkl.rb +5 -1
- data/lib/jkl/text_client.rb +7 -5
- data/test/unit/jkl_test.rb +8 -9
- data/test/unit/text_cleaning_test.rb +20 -7
- metadata +17 -9
- data/lib/jkl/calais_client.rb +0 -28
data/Gemfile.lock
CHANGED
@@ -2,30 +2,25 @@ PATH
|
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
4
|
jakal (0.2.0)
|
5
|
-
calais (>= 0.0.11)
|
6
5
|
mechanize (>= 1.0.0)
|
7
6
|
nokogiri (~> 1.4.4)
|
8
7
|
rake (>= 0.8.7)
|
8
|
+
term-extract (~> 0.5.1)
|
9
9
|
|
10
10
|
GEM
|
11
11
|
remote: http://rubygems.org/
|
12
12
|
specs:
|
13
13
|
addressable (2.2.6)
|
14
14
|
archive-tar-minitar (0.5.2)
|
15
|
-
calais (0.0.11)
|
16
|
-
curb (>= 0.1.4)
|
17
|
-
json (>= 1.1.3)
|
18
|
-
nokogiri (>= 1.3.3)
|
19
15
|
columnize (0.3.2)
|
20
16
|
crack (0.1.8)
|
21
|
-
curb (0.7.15)
|
22
|
-
json (1.5.1)
|
23
17
|
linecache19 (0.5.12)
|
24
18
|
ruby_core_source (>= 0.1.4)
|
25
19
|
mechanize (1.0.0)
|
26
20
|
nokogiri (>= 1.2.1)
|
27
21
|
nokogiri (1.4.4)
|
28
22
|
rake (0.8.7)
|
23
|
+
rbtagger (0.4.7)
|
29
24
|
ruby-debug-base19 (0.11.25)
|
30
25
|
columnize (>= 0.3.1)
|
31
26
|
linecache19 (>= 0.5.11)
|
@@ -37,6 +32,8 @@ GEM
|
|
37
32
|
ruby_core_source (0.1.5)
|
38
33
|
archive-tar-minitar (>= 0.5.2)
|
39
34
|
shoulda (2.11.3)
|
35
|
+
term-extract (0.5.1)
|
36
|
+
rbtagger
|
40
37
|
test-unit (2.3.0)
|
41
38
|
webmock (1.6.2)
|
42
39
|
addressable (>= 2.2.2)
|
@@ -46,12 +43,12 @@ PLATFORMS
|
|
46
43
|
ruby
|
47
44
|
|
48
45
|
DEPENDENCIES
|
49
|
-
calais (>= 0.0.11)
|
50
46
|
jakal!
|
51
47
|
mechanize (>= 1.0.0)
|
52
48
|
nokogiri (~> 1.4.4)
|
53
49
|
rake (>= 0.8.7)
|
54
50
|
ruby-debug19 (= 0.11.6)
|
55
51
|
shoulda (= 2.11.3)
|
52
|
+
term-extract (~> 0.5.1)
|
56
53
|
test-unit (= 2.3.0)
|
57
54
|
webmock (= 1.6.2)
|
data/README.md
CHANGED
@@ -1,59 +1,7 @@
|
|
1
1
|
# jkl
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
* Connects to URLs.
|
6
|
-
* Gets stuff out of RSS feeds.
|
7
|
-
* Gets the main content from web pages
|
8
|
-
* Gets a set of metadata from a web page (using the calais gem)
|
9
|
-
|
10
|
-
# Sample usage
|
11
|
-
|
12
|
-
For example - if you had a RSS feed:
|
13
|
-
|
14
|
-
require "jkl"
|
15
|
-
|
16
|
-
feed = "http://www.topix.net/rss/search/article?x=0&y=0&q=London"
|
17
|
-
|
18
|
-
You could collect some metadata from the links in that feed, thus:
|
19
|
-
|
20
|
-
tags = []
|
21
|
-
Jkl::links(feed).each do |link|
|
22
|
-
tags << Jkl::tags("my_calais_key",link)
|
23
|
-
end
|
24
|
-
|
25
|
-
A metadata sample might look something like this:
|
26
|
-
|
27
|
-
{
|
28
|
-
"Person"=>["Barack Obama", "Hillary Clinton"],
|
29
|
-
"Position"=>["Secretary of State"]
|
30
|
-
}
|
3
|
+
Jakal is a Ruby library for tagging keywords from web pages.
|
31
4
|
|
32
5
|
It is hosted at [gemcutter](http://gemcutter.org/gems/jakal)
|
33
6
|
|
34
|
-
|
35
|
-
|
36
|
-
# LICENSE:
|
37
|
-
|
38
|
-
(The MIT License)
|
39
|
-
|
40
|
-
Copyright (c) 2009 sshingler
|
41
|
-
|
42
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
43
|
-
a copy of this software and associated documentation files (the
|
44
|
-
'Software'), to deal in the Software without restriction, including
|
45
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
46
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
47
|
-
permit persons to whom the Software is furnished to do so, subject to
|
48
|
-
the following conditions:
|
49
|
-
|
50
|
-
The above copyright notice and this permission notice shall be
|
51
|
-
included in all copies or substantial portions of the Software.
|
52
|
-
|
53
|
-
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
54
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
55
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
56
|
-
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
57
|
-
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
58
|
-
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
59
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
7
|
+
gem install jakal
|
data/bin/convert_to_should_syntax
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'convert_to_should_syntax' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('shoulda', 'convert_to_should_syntax')
|
data/bin/edit_json.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'edit_json.rb' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('json', 'edit_json.rb')
|
data/bin/minitar
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'minitar' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('archive-tar-minitar', 'minitar')
|
data/bin/nokogiri
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'nokogiri' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('nokogiri', 'nokogiri')
|
data/bin/prettify_json.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'prettify_json.rb' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('json', 'prettify_json.rb')
|
data/bin/rake
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rake' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('rake', 'rake')
|
data/bin/rdebug
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rdebug' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('ruby-debug19', 'rdebug')
|
data/bin/term-extract
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'term-extract' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('term-extract', 'term-extract')
|
data/bin/testrb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'testrb' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('test-unit', 'testrb')
|
data/jkl.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = %q{jakal}
|
3
|
-
s.version = "0.2.0"
|
3
|
+
s.version = "0.2.1"
|
4
4
|
s.platform = Gem::Platform::RUBY
|
5
5
|
s.authors = ['sshingler']
|
6
6
|
s.homepage = %q{http://github.com/sshingler/jkl}
|
@@ -15,5 +15,5 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.add_dependency(%q<rake>, [">= 0.8.7"])
|
16
16
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
17
17
|
s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
|
18
|
-
s.add_dependency(%q<calais>, [">= 0.0.11"])
|
18
|
+
s.add_dependency(%q<term-extract>, ["~> 0.5.1"])
|
19
19
|
end
|
data/lib/jkl.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require_relative "jkl/rss_client"
|
2
|
-
require_relative "jkl/calais_client"
|
3
2
|
require_relative "jkl/text_client"
|
4
3
|
|
5
4
|
require "mechanize"
|
5
|
+
require "term-extract"
|
6
6
|
|
7
7
|
module Jkl
|
8
8
|
class << self
|
@@ -18,5 +18,9 @@ module Jkl
|
|
18
18
|
yield link if block_given?
|
19
19
|
end
|
20
20
|
end
|
21
|
+
|
22
|
+
def tags(text)
|
23
|
+
TermExtract.extract(text.force_encoding("UTF-8")).keys
|
24
|
+
end
|
21
25
|
end
|
22
26
|
end
|
data/lib/jkl/text_client.rb
CHANGED
@@ -3,8 +3,12 @@ module Jkl
|
|
3
3
|
class << self
|
4
4
|
|
5
5
|
def plain_text(document, words_on_line = 5)
|
6
|
-
|
6
|
+
CGI::unescapeHTML(
|
7
|
+
remove_short_lines(
|
8
|
+
strip_all_tags(
|
9
|
+
remove_script_tags(document)), words_on_line))
|
7
10
|
end
|
11
|
+
alias :sanitize :plain_text
|
8
12
|
|
9
13
|
def strip_all_tags(text)
|
10
14
|
text.gsub(/<\/?[^>]*>/, "")
|
@@ -26,13 +30,11 @@ module Jkl
|
|
26
30
|
def remove_short_lines(text, words_on_line = 5)
|
27
31
|
text = text.gsub(/\s\s/, "\n")
|
28
32
|
str = ""
|
29
|
-
|
30
|
-
|
31
|
-
str << l unless l.count(" ") < words_on_line
|
33
|
+
text.split("\n").each do |line|
|
34
|
+
str << "#{line}\n" unless line.count(" ") < words_on_line
|
32
35
|
end
|
33
36
|
str
|
34
37
|
end
|
35
|
-
|
36
38
|
end
|
37
39
|
end
|
38
40
|
end
|
data/test/unit/jkl_test.rb
CHANGED
@@ -2,13 +2,12 @@ require "test/unit"
|
|
2
2
|
require "shoulda"
|
3
3
|
require "webmock/test_unit"
|
4
4
|
require "yaml"
|
5
|
-
|
6
5
|
require_relative "../../lib/jkl"
|
7
6
|
|
8
7
|
class JklTest < Test::Unit::TestCase
|
9
8
|
include WebMock::API
|
10
9
|
|
11
|
-
context "
|
10
|
+
context "Jkl: When handling documents, plain text and tags" do
|
12
11
|
setup do
|
13
12
|
@url = "http://www.bbc.co.uk"
|
14
13
|
response = File.read('test/fixtures/bbc_story.html')
|
@@ -18,27 +17,27 @@ class JklTest < Test::Unit::TestCase
|
|
18
17
|
:headers => {'Content-Type' => 'text/html'})
|
19
18
|
end
|
20
19
|
|
21
|
-
should "
|
20
|
+
should "get a document from a URL" do
|
22
21
|
doc = Jkl::get(@url)
|
23
22
|
assert_not_nil doc
|
24
23
|
end
|
25
24
|
|
26
|
-
should "
|
25
|
+
should "get the plain text version of a document" do
|
27
26
|
document = Jkl::get(@url)
|
28
27
|
text = Jkl::Text::plain_text(document,2)
|
29
|
-
assert_equal
|
28
|
+
assert_equal 9009, text.length
|
30
29
|
end
|
31
30
|
|
32
|
-
should "
|
31
|
+
should "get the keywords from a document" do
|
33
32
|
document = Jkl::get(@url)
|
34
33
|
text = Jkl::Text::plain_text(document,2)
|
35
|
-
tags = Jkl::
|
34
|
+
tags = Jkl::tags(text)
|
36
35
|
assert ! tags.empty?
|
37
36
|
end
|
38
37
|
end
|
39
38
|
|
40
|
-
context "
|
41
|
-
should "
|
39
|
+
context "Jkl: When handling RSS" do
|
40
|
+
should "get links from a feed" do
|
42
41
|
feed = "http://feeds.bbci.co.uk/news/rss.xml"
|
43
42
|
response = File.read('test/fixtures/topix_rss.xml')
|
44
43
|
stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
|
data/test/unit/text_cleaning_test.rb
CHANGED
@@ -13,7 +13,7 @@ the cat sat on the mat
|
|
13
13
|
a short line
|
14
14
|
HTML
|
15
15
|
result = Jkl::Text::remove_short_lines input
|
16
|
-
assert result == "the cat sat on the mat"
|
16
|
+
assert result == "the cat sat on the mat\n"
|
17
17
|
end
|
18
18
|
|
19
19
|
should "Remove shorter lines" do
|
@@ -23,7 +23,8 @@ the cat sat on the slightly fluffy, yet worn and homely mat
|
|
23
23
|
a short line
|
24
24
|
HTML
|
25
25
|
result = Jkl::Text::remove_short_lines(input, 8)
|
26
|
-
|
26
|
+
expected = "the cat sat on the slightly fluffy, yet worn and homely mat\n"
|
27
|
+
assert_equal expected, result
|
27
28
|
end
|
28
29
|
|
29
30
|
should "Remove script tags" do
|
@@ -35,7 +36,7 @@ function nofunction(){var bob;}
|
|
35
36
|
a short line
|
36
37
|
HTML
|
37
38
|
result = Jkl::Text::remove_short_lines input
|
38
|
-
assert result == "the cat sat on the mat"
|
39
|
+
assert result == "the cat sat on the mat\n"
|
39
40
|
end
|
40
41
|
|
41
42
|
should "Remove html comments" do
|
@@ -45,7 +46,7 @@ the cat sat on the mat
|
|
45
46
|
a short line
|
46
47
|
HTML
|
47
48
|
result = Jkl::Text::remove_short_lines input
|
48
|
-
assert result == "the cat sat on the mat"
|
49
|
+
assert result == "the cat sat on the mat\n"
|
49
50
|
end
|
50
51
|
|
51
52
|
should "Remove blank lines" do
|
@@ -55,7 +56,7 @@ the cat sat on the mat
|
|
55
56
|
a short line
|
56
57
|
HTML
|
57
58
|
result = Jkl::Text::remove_short_lines input
|
58
|
-
assert result == "the cat sat on the mat"
|
59
|
+
assert result == "the cat sat on the mat\n"
|
59
60
|
end
|
60
61
|
|
61
62
|
should "Strip all tags" do
|
@@ -63,7 +64,7 @@ HTML
|
|
63
64
|
<p>the cat sat on the mat</p>
|
64
65
|
HTML
|
65
66
|
result = Jkl::Text::strip_all_tags input
|
66
|
-
assert result == "the cat sat on the mat\n"
|
67
|
+
assert result == "the cat sat on the mat\n"
|
67
68
|
end
|
68
69
|
|
69
70
|
should "Clean text" do
|
@@ -76,7 +77,19 @@ the cat sat on the mat
|
|
76
77
|
some end stuff here
|
77
78
|
HTML
|
78
79
|
result = Jkl::Text::plain_text(input)
|
79
|
-
assert result == "the cat sat on the mat"
|
80
|
+
assert result == "the cat sat on the mat\n"
|
81
|
+
end
|
82
|
+
|
83
|
+
should "Remove HTML escaped characters" do
|
84
|
+
input = <<HTML
|
85
|
+
Testing, testing, one two three.
|
86
|
+
<p><strong>The cat didn't sit on the mat</strong></p>
|
87
|
+
HTML
|
88
|
+
expected = <<EXPECTED
|
89
|
+
Testing, testing, one two three.
|
90
|
+
The cat didn't sit on the mat
|
91
|
+
EXPECTED
|
92
|
+
assert_equal expected, Jkl::Text::plain_text(input, 2)
|
80
93
|
end
|
81
94
|
end
|
82
95
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 0
|
9
|
-
version: 0.2.0
|
8
|
+
- 1
|
9
|
+
version: 0.2.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- sshingler
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-02 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -63,18 +63,18 @@ dependencies:
|
|
63
63
|
type: :runtime
|
64
64
|
version_requirements: *id003
|
65
65
|
- !ruby/object:Gem::Dependency
|
66
|
-
name: calais
|
66
|
+
name: term-extract
|
67
67
|
prerelease: false
|
68
68
|
requirement: &id004 !ruby/object:Gem::Requirement
|
69
69
|
none: false
|
70
70
|
requirements:
|
71
|
-
- - ">="
|
71
|
+
- - ~>
|
72
72
|
- !ruby/object:Gem::Version
|
73
73
|
segments:
|
74
74
|
- 0
|
75
|
-
- 0
|
76
|
-
- 11
|
77
|
-
version: 0.0.11
|
75
|
+
- 5
|
76
|
+
- 1
|
77
|
+
version: 0.5.1
|
78
78
|
type: :runtime
|
79
79
|
version_requirements: *id004
|
80
80
|
description: Jakal is a Ruby library for tagging keywords from web pages.
|
@@ -92,9 +92,17 @@ files:
|
|
92
92
|
- License.txt
|
93
93
|
- README.md
|
94
94
|
- Rakefile
|
95
|
+
- bin/convert_to_should_syntax
|
96
|
+
- bin/edit_json.rb
|
97
|
+
- bin/minitar
|
98
|
+
- bin/nokogiri
|
99
|
+
- bin/prettify_json.rb
|
100
|
+
- bin/rake
|
101
|
+
- bin/rdebug
|
102
|
+
- bin/term-extract
|
103
|
+
- bin/testrb
|
95
104
|
- jkl.gemspec
|
96
105
|
- lib/jkl.rb
|
97
|
-
- lib/jkl/calais_client.rb
|
98
106
|
- lib/jkl/rss_client.rb
|
99
107
|
- lib/jkl/text_client.rb
|
100
108
|
- test/fixtures/bbc_story.html
|
data/lib/jkl/calais_client.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
require "calais"
|
2
|
-
|
3
|
-
module Jkl
|
4
|
-
module Extraction
|
5
|
-
class << self
|
6
|
-
|
7
|
-
def calais_response(key, text)
|
8
|
-
Calais.process_document(
|
9
|
-
:content => text,
|
10
|
-
:license_id => key
|
11
|
-
)
|
12
|
-
end
|
13
|
-
|
14
|
-
def entities(key,text)
|
15
|
-
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
16
|
-
end
|
17
|
-
|
18
|
-
def tags(key, text)
|
19
|
-
nested_list = {}
|
20
|
-
entities(key,text).each do |a|
|
21
|
-
nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
|
22
|
-
end
|
23
|
-
nested_list
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|