jakal 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +5 -8
- data/README.md +2 -54
- data/bin/convert_to_should_syntax +14 -0
- data/bin/edit_json.rb +14 -0
- data/bin/minitar +14 -0
- data/bin/nokogiri +14 -0
- data/bin/prettify_json.rb +14 -0
- data/bin/rake +14 -0
- data/bin/rdebug +14 -0
- data/bin/term-extract +14 -0
- data/bin/testrb +14 -0
- data/jkl.gemspec +2 -2
- data/lib/jkl.rb +5 -1
- data/lib/jkl/text_client.rb +7 -5
- data/test/unit/jkl_test.rb +8 -9
- data/test/unit/text_cleaning_test.rb +20 -7
- metadata +17 -9
- data/lib/jkl/calais_client.rb +0 -28
data/Gemfile.lock
CHANGED
@@ -2,30 +2,25 @@ PATH
|
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
4
|
jakal (0.2.0)
|
5
|
-
calais (>= 0.0.11)
|
6
5
|
mechanize (>= 1.0.0)
|
7
6
|
nokogiri (~> 1.4.4)
|
8
7
|
rake (>= 0.8.7)
|
8
|
+
term-extract (~> 0.5.1)
|
9
9
|
|
10
10
|
GEM
|
11
11
|
remote: http://rubygems.org/
|
12
12
|
specs:
|
13
13
|
addressable (2.2.6)
|
14
14
|
archive-tar-minitar (0.5.2)
|
15
|
-
calais (0.0.11)
|
16
|
-
curb (>= 0.1.4)
|
17
|
-
json (>= 1.1.3)
|
18
|
-
nokogiri (>= 1.3.3)
|
19
15
|
columnize (0.3.2)
|
20
16
|
crack (0.1.8)
|
21
|
-
curb (0.7.15)
|
22
|
-
json (1.5.1)
|
23
17
|
linecache19 (0.5.12)
|
24
18
|
ruby_core_source (>= 0.1.4)
|
25
19
|
mechanize (1.0.0)
|
26
20
|
nokogiri (>= 1.2.1)
|
27
21
|
nokogiri (1.4.4)
|
28
22
|
rake (0.8.7)
|
23
|
+
rbtagger (0.4.7)
|
29
24
|
ruby-debug-base19 (0.11.25)
|
30
25
|
columnize (>= 0.3.1)
|
31
26
|
linecache19 (>= 0.5.11)
|
@@ -37,6 +32,8 @@ GEM
|
|
37
32
|
ruby_core_source (0.1.5)
|
38
33
|
archive-tar-minitar (>= 0.5.2)
|
39
34
|
shoulda (2.11.3)
|
35
|
+
term-extract (0.5.1)
|
36
|
+
rbtagger
|
40
37
|
test-unit (2.3.0)
|
41
38
|
webmock (1.6.2)
|
42
39
|
addressable (>= 2.2.2)
|
@@ -46,12 +43,12 @@ PLATFORMS
|
|
46
43
|
ruby
|
47
44
|
|
48
45
|
DEPENDENCIES
|
49
|
-
calais (>= 0.0.11)
|
50
46
|
jakal!
|
51
47
|
mechanize (>= 1.0.0)
|
52
48
|
nokogiri (~> 1.4.4)
|
53
49
|
rake (>= 0.8.7)
|
54
50
|
ruby-debug19 (= 0.11.6)
|
55
51
|
shoulda (= 2.11.3)
|
52
|
+
term-extract (~> 0.5.1)
|
56
53
|
test-unit (= 2.3.0)
|
57
54
|
webmock (= 1.6.2)
|
data/README.md
CHANGED
@@ -1,59 +1,7 @@
|
|
1
1
|
# jkl
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
* Connects to URLs.
|
6
|
-
* Gets stuff out of RSS feeds.
|
7
|
-
* Gets the main content from web pages
|
8
|
-
* Gets a set of metadata from a web page (using the calais gem)
|
9
|
-
|
10
|
-
# Sample usage
|
11
|
-
|
12
|
-
For example - if you had a RSS feed:
|
13
|
-
|
14
|
-
require "jkl"
|
15
|
-
|
16
|
-
feed = "http://www.topix.net/rss/search/article?x=0&y=0&q=London"
|
17
|
-
|
18
|
-
You could collect some metadata from the links in that feed, thus:
|
19
|
-
|
20
|
-
tags = []
|
21
|
-
Jkl::links(feed).each do |link|
|
22
|
-
tags << Jkl::tags("my_calais_key",link)
|
23
|
-
end
|
24
|
-
|
25
|
-
A metadata sample might look something like this:
|
26
|
-
|
27
|
-
{
|
28
|
-
"Person"=>["Barack Obama", "Hillary Clinton"],
|
29
|
-
"Position"=>["Secretary of State"]
|
30
|
-
}
|
3
|
+
Jakal is a Ruby library for tagging keywords from web pages.
|
31
4
|
|
32
5
|
It is hosted at [gemcutter](http://gemcutter.org/gems/jakal)
|
33
6
|
|
34
|
-
|
35
|
-
|
36
|
-
# LICENSE:
|
37
|
-
|
38
|
-
(The MIT License)
|
39
|
-
|
40
|
-
Copyright (c) 2009 sshingler
|
41
|
-
|
42
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
43
|
-
a copy of this software and associated documentation files (the
|
44
|
-
'Software'), to deal in the Software without restriction, including
|
45
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
46
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
47
|
-
permit persons to whom the Software is furnished to do so, subject to
|
48
|
-
the following conditions:
|
49
|
-
|
50
|
-
The above copyright notice and this permission notice shall be
|
51
|
-
included in all copies or substantial portions of the Software.
|
52
|
-
|
53
|
-
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
54
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
55
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
56
|
-
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
57
|
-
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
58
|
-
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
59
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
7
|
+
gem install jakal
|
data/bin/convert_to_should_syntax
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'convert_to_should_syntax' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('shoulda', 'convert_to_should_syntax')
|
data/bin/edit_json.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'edit_json.rb' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('json', 'edit_json.rb')
|
data/bin/minitar
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'minitar' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('archive-tar-minitar', 'minitar')
|
data/bin/nokogiri
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'nokogiri' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('nokogiri', 'nokogiri')
|
data/bin/prettify_json.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'prettify_json.rb' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('json', 'prettify_json.rb')
|
data/bin/rake
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rake' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('rake', 'rake')
|
data/bin/rdebug
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rdebug' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('ruby-debug19', 'rdebug')
|
data/bin/term-extract
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'term-extract' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('term-extract', 'term-extract')
|
data/bin/testrb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'testrb' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'bundler/setup'
|
13
|
+
|
14
|
+
load Gem.bin_path('test-unit', 'testrb')
|
data/jkl.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = %q{jakal}
|
3
|
-
s.version = "0.2.0"
|
3
|
+
s.version = "0.2.1"
|
4
4
|
s.platform = Gem::Platform::RUBY
|
5
5
|
s.authors = ['sshingler']
|
6
6
|
s.homepage = %q{http://github.com/sshingler/jkl}
|
@@ -15,5 +15,5 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.add_dependency(%q<rake>, [">= 0.8.7"])
|
16
16
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
17
17
|
s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
|
18
|
-
s.add_dependency(%q<calais>, [">= 0.0.11"])
|
18
|
+
s.add_dependency(%q<term-extract>, ["~> 0.5.1"])
|
19
19
|
end
|
data/lib/jkl.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require_relative "jkl/rss_client"
|
2
|
-
require_relative "jkl/calais_client"
|
3
2
|
require_relative "jkl/text_client"
|
4
3
|
|
5
4
|
require "mechanize"
|
5
|
+
require "term-extract"
|
6
6
|
|
7
7
|
module Jkl
|
8
8
|
class << self
|
@@ -18,5 +18,9 @@ module Jkl
|
|
18
18
|
yield link if block_given?
|
19
19
|
end
|
20
20
|
end
|
21
|
+
|
22
|
+
def tags(text)
|
23
|
+
TermExtract.extract(text.force_encoding("UTF-8")).keys
|
24
|
+
end
|
21
25
|
end
|
22
26
|
end
|
data/lib/jkl/text_client.rb
CHANGED
@@ -3,8 +3,12 @@ module Jkl
|
|
3
3
|
class << self
|
4
4
|
|
5
5
|
def plain_text(document, words_on_line = 5)
|
6
|
-
|
6
|
+
CGI::unescapeHTML(
|
7
|
+
remove_short_lines(
|
8
|
+
strip_all_tags(
|
9
|
+
remove_script_tags(document)), words_on_line))
|
7
10
|
end
|
11
|
+
alias :sanitize :plain_text
|
8
12
|
|
9
13
|
def strip_all_tags(text)
|
10
14
|
text.gsub(/<\/?[^>]*>/, "")
|
@@ -26,13 +30,11 @@ module Jkl
|
|
26
30
|
def remove_short_lines(text, words_on_line = 5)
|
27
31
|
text = text.gsub(/\s\s/, "\n")
|
28
32
|
str = ""
|
29
|
-
|
30
|
-
|
31
|
-
str << l unless l.count(" ") < words_on_line
|
33
|
+
text.split("\n").each do |line|
|
34
|
+
str << "#{line}\n" unless line.count(" ") < words_on_line
|
32
35
|
end
|
33
36
|
str
|
34
37
|
end
|
35
|
-
|
36
38
|
end
|
37
39
|
end
|
38
40
|
end
|
data/test/unit/jkl_test.rb
CHANGED
@@ -2,13 +2,12 @@ require "test/unit"
|
|
2
2
|
require "shoulda"
|
3
3
|
require "webmock/test_unit"
|
4
4
|
require "yaml"
|
5
|
-
|
6
5
|
require_relative "../../lib/jkl"
|
7
6
|
|
8
7
|
class JklTest < Test::Unit::TestCase
|
9
8
|
include WebMock::API
|
10
9
|
|
11
|
-
context "
|
10
|
+
context "Jkl: When handling documents, plain text and tags" do
|
12
11
|
setup do
|
13
12
|
@url = "http://www.bbc.co.uk"
|
14
13
|
response = File.read('test/fixtures/bbc_story.html')
|
@@ -18,27 +17,27 @@ class JklTest < Test::Unit::TestCase
|
|
18
17
|
:headers => {'Content-Type' => 'text/html'})
|
19
18
|
end
|
20
19
|
|
21
|
-
should "
|
20
|
+
should "get a document from a URL" do
|
22
21
|
doc = Jkl::get(@url)
|
23
22
|
assert_not_nil doc
|
24
23
|
end
|
25
24
|
|
26
|
-
should "
|
25
|
+
should "get the plain text version of a document" do
|
27
26
|
document = Jkl::get(@url)
|
28
27
|
text = Jkl::Text::plain_text(document,2)
|
29
|
-
assert_equal
|
28
|
+
assert_equal 9009, text.length
|
30
29
|
end
|
31
30
|
|
32
|
-
should "
|
31
|
+
should "get the keywords from a document" do
|
33
32
|
document = Jkl::get(@url)
|
34
33
|
text = Jkl::Text::plain_text(document,2)
|
35
|
-
tags = Jkl::
|
34
|
+
tags = Jkl::tags(text)
|
36
35
|
assert ! tags.empty?
|
37
36
|
end
|
38
37
|
end
|
39
38
|
|
40
|
-
context "
|
41
|
-
should "
|
39
|
+
context "Jkl: When handling RSS" do
|
40
|
+
should "get links from a feed" do
|
42
41
|
feed = "http://feeds.bbci.co.uk/news/rss.xml"
|
43
42
|
response = File.read('test/fixtures/topix_rss.xml')
|
44
43
|
stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
|
data/test/unit/text_cleaning_test.rb
CHANGED
@@ -13,7 +13,7 @@ the cat sat on the mat
|
|
13
13
|
a short line
|
14
14
|
HTML
|
15
15
|
result = Jkl::Text::remove_short_lines input
|
16
|
-
assert result == "the cat sat on the mat"
|
16
|
+
assert result == "the cat sat on the mat\n"
|
17
17
|
end
|
18
18
|
|
19
19
|
should "Remove shorter lines" do
|
@@ -23,7 +23,8 @@ the cat sat on the slightly fluffy, yet worn and homely mat
|
|
23
23
|
a short line
|
24
24
|
HTML
|
25
25
|
result = Jkl::Text::remove_short_lines(input, 8)
|
26
|
-
|
26
|
+
expected = "the cat sat on the slightly fluffy, yet worn and homely mat\n"
|
27
|
+
assert_equal expected, result
|
27
28
|
end
|
28
29
|
|
29
30
|
should "Remove script tags" do
|
@@ -35,7 +36,7 @@ function nofunction(){var bob;}
|
|
35
36
|
a short line
|
36
37
|
HTML
|
37
38
|
result = Jkl::Text::remove_short_lines input
|
38
|
-
assert result == "the cat sat on the mat"
|
39
|
+
assert result == "the cat sat on the mat\n"
|
39
40
|
end
|
40
41
|
|
41
42
|
should "Remove html comments" do
|
@@ -45,7 +46,7 @@ the cat sat on the mat
|
|
45
46
|
a short line
|
46
47
|
HTML
|
47
48
|
result = Jkl::Text::remove_short_lines input
|
48
|
-
assert result == "the cat sat on the mat"
|
49
|
+
assert result == "the cat sat on the mat\n"
|
49
50
|
end
|
50
51
|
|
51
52
|
should "Remove blank lines" do
|
@@ -55,7 +56,7 @@ the cat sat on the mat
|
|
55
56
|
a short line
|
56
57
|
HTML
|
57
58
|
result = Jkl::Text::remove_short_lines input
|
58
|
-
assert result == "the cat sat on the mat"
|
59
|
+
assert result == "the cat sat on the mat\n"
|
59
60
|
end
|
60
61
|
|
61
62
|
should "Strip all tags" do
|
@@ -63,7 +64,7 @@ HTML
|
|
63
64
|
<p>the cat sat on the mat</p>
|
64
65
|
HTML
|
65
66
|
result = Jkl::Text::strip_all_tags input
|
66
|
-
assert result == "the cat sat on the mat\n"
|
67
|
+
assert result == "the cat sat on the mat\n"
|
67
68
|
end
|
68
69
|
|
69
70
|
should "Clean text" do
|
@@ -76,7 +77,19 @@ the cat sat on the mat
|
|
76
77
|
some end stuff here
|
77
78
|
HTML
|
78
79
|
result = Jkl::Text::plain_text(input)
|
79
|
-
assert result == "the cat sat on the mat"
|
80
|
+
assert result == "the cat sat on the mat\n"
|
81
|
+
end
|
82
|
+
|
83
|
+
should "Remove HTML escaped characters" do
|
84
|
+
input = <<HTML
|
85
|
+
Testing, testing, one two three.
|
86
|
+
<p><strong>The cat didn't sit on the mat</strong></p>
|
87
|
+
HTML
|
88
|
+
expected = <<EXPECTED
|
89
|
+
Testing, testing, one two three.
|
90
|
+
The cat didn't sit on the mat
|
91
|
+
EXPECTED
|
92
|
+
assert_equal expected, Jkl::Text::plain_text(input, 2)
|
80
93
|
end
|
81
94
|
end
|
82
95
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 0
|
9
|
-
version: 0.2.0
|
8
|
+
- 1
|
9
|
+
version: 0.2.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- sshingler
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-02 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -63,18 +63,18 @@ dependencies:
|
|
63
63
|
type: :runtime
|
64
64
|
version_requirements: *id003
|
65
65
|
- !ruby/object:Gem::Dependency
|
66
|
-
name: calais
|
66
|
+
name: term-extract
|
67
67
|
prerelease: false
|
68
68
|
requirement: &id004 !ruby/object:Gem::Requirement
|
69
69
|
none: false
|
70
70
|
requirements:
|
71
|
-
- - ">="
|
71
|
+
- - ~>
|
72
72
|
- !ruby/object:Gem::Version
|
73
73
|
segments:
|
74
74
|
- 0
|
75
|
-
- 0
|
76
|
-
- 11
|
77
|
-
version: 0.0.11
|
75
|
+
- 5
|
76
|
+
- 1
|
77
|
+
version: 0.5.1
|
78
78
|
type: :runtime
|
79
79
|
version_requirements: *id004
|
80
80
|
description: Jakal is a Ruby library for tagging keywords from web pages.
|
@@ -92,9 +92,17 @@ files:
|
|
92
92
|
- License.txt
|
93
93
|
- README.md
|
94
94
|
- Rakefile
|
95
|
+
- bin/convert_to_should_syntax
|
96
|
+
- bin/edit_json.rb
|
97
|
+
- bin/minitar
|
98
|
+
- bin/nokogiri
|
99
|
+
- bin/prettify_json.rb
|
100
|
+
- bin/rake
|
101
|
+
- bin/rdebug
|
102
|
+
- bin/term-extract
|
103
|
+
- bin/testrb
|
95
104
|
- jkl.gemspec
|
96
105
|
- lib/jkl.rb
|
97
|
-
- lib/jkl/calais_client.rb
|
98
106
|
- lib/jkl/rss_client.rb
|
99
107
|
- lib/jkl/text_client.rb
|
100
108
|
- test/fixtures/bbc_story.html
|
data/lib/jkl/calais_client.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
require "calais"
|
2
|
-
|
3
|
-
module Jkl
|
4
|
-
module Extraction
|
5
|
-
class << self
|
6
|
-
|
7
|
-
def calais_response(key, text)
|
8
|
-
Calais.process_document(
|
9
|
-
:content => text,
|
10
|
-
:license_id => key
|
11
|
-
)
|
12
|
-
end
|
13
|
-
|
14
|
-
def entities(key,text)
|
15
|
-
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
16
|
-
end
|
17
|
-
|
18
|
-
def tags(key, text)
|
19
|
-
nested_list = {}
|
20
|
-
entities(key,text).each do |a|
|
21
|
-
nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
|
22
|
-
end
|
23
|
-
nested_list
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|