jakal 0.1.96 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +57 -0
- data/Rakefile +7 -0
- data/jkl.gemspec +19 -0
- data/lib/jkl/rss_client.rb +4 -16
- data/lib/jkl/text_client.rb +2 -3
- data/lib/jkl.rb +10 -20
- data/test/unit/jkl_test.rb +39 -61
- data/test/unit/text_cleaning_test.rb +1 -1
- metadata +36 -32
- data/config/config.yml +0 -4
- data/lib/jkl/rest_client.rb +0 -49
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
jakal (0.2.0)
|
5
|
+
calais (>= 0.0.11)
|
6
|
+
mechanize (>= 1.0.0)
|
7
|
+
nokogiri (~> 1.4.4)
|
8
|
+
rake (>= 0.8.7)
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: http://rubygems.org/
|
12
|
+
specs:
|
13
|
+
addressable (2.2.6)
|
14
|
+
archive-tar-minitar (0.5.2)
|
15
|
+
calais (0.0.11)
|
16
|
+
curb (>= 0.1.4)
|
17
|
+
json (>= 1.1.3)
|
18
|
+
nokogiri (>= 1.3.3)
|
19
|
+
columnize (0.3.2)
|
20
|
+
crack (0.1.8)
|
21
|
+
curb (0.7.15)
|
22
|
+
json (1.5.1)
|
23
|
+
linecache19 (0.5.12)
|
24
|
+
ruby_core_source (>= 0.1.4)
|
25
|
+
mechanize (1.0.0)
|
26
|
+
nokogiri (>= 1.2.1)
|
27
|
+
nokogiri (1.4.4)
|
28
|
+
rake (0.8.7)
|
29
|
+
ruby-debug-base19 (0.11.25)
|
30
|
+
columnize (>= 0.3.1)
|
31
|
+
linecache19 (>= 0.5.11)
|
32
|
+
ruby_core_source (>= 0.1.4)
|
33
|
+
ruby-debug19 (0.11.6)
|
34
|
+
columnize (>= 0.3.1)
|
35
|
+
linecache19 (>= 0.5.11)
|
36
|
+
ruby-debug-base19 (>= 0.11.19)
|
37
|
+
ruby_core_source (0.1.5)
|
38
|
+
archive-tar-minitar (>= 0.5.2)
|
39
|
+
shoulda (2.11.3)
|
40
|
+
test-unit (2.3.0)
|
41
|
+
webmock (1.6.2)
|
42
|
+
addressable (>= 2.2.2)
|
43
|
+
crack (>= 0.1.7)
|
44
|
+
|
45
|
+
PLATFORMS
|
46
|
+
ruby
|
47
|
+
|
48
|
+
DEPENDENCIES
|
49
|
+
calais (>= 0.0.11)
|
50
|
+
jakal!
|
51
|
+
mechanize (>= 1.0.0)
|
52
|
+
nokogiri (~> 1.4.4)
|
53
|
+
rake (>= 0.8.7)
|
54
|
+
ruby-debug19 (= 0.11.6)
|
55
|
+
shoulda (= 2.11.3)
|
56
|
+
test-unit (= 2.3.0)
|
57
|
+
webmock (= 1.6.2)
|
data/Rakefile
ADDED
data/jkl.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.name = %q{jakal}
|
3
|
+
s.version = "0.2.0"
|
4
|
+
s.platform = Gem::Platform::RUBY
|
5
|
+
s.authors = ['sshingler']
|
6
|
+
s.homepage = %q{http://github.com/sshingler/jkl}
|
7
|
+
s.description = %q{Jakal is a Ruby library for tagging keywords from web pages.}
|
8
|
+
s.summary = s.description
|
9
|
+
s.email = %q{'shingler@gmail.com'}
|
10
|
+
|
11
|
+
s.require_paths = ["lib"]
|
12
|
+
s.files = `git ls-files`.split("\n")
|
13
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
14
|
+
|
15
|
+
s.add_dependency(%q<rake>, [">= 0.8.7"])
|
16
|
+
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
17
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
|
18
|
+
s.add_dependency(%q<calais>, [">= 0.0.11"])
|
19
|
+
end
|
data/lib/jkl/rss_client.rb
CHANGED
@@ -1,27 +1,15 @@
|
|
1
|
-
require 'hpricot'
|
2
1
|
|
3
2
|
module Jkl
|
4
3
|
module Rss
|
5
4
|
class << self
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
def items(rss)
|
6
|
+
rss_doc = Nokogiri::Slop(rss)
|
7
|
+
rss_doc.rss.channel.item
|
9
8
|
end
|
10
9
|
|
11
10
|
def links(items)
|
12
|
-
items.map{|item|
|
13
|
-
end
|
14
|
-
|
15
|
-
def descriptions(items)
|
16
|
-
items.map do |item|
|
17
|
-
attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
18
|
-
end
|
11
|
+
items.map{|item| item.link.inner_html}
|
19
12
|
end
|
20
|
-
|
21
|
-
def attribute_from(item, name)
|
22
|
-
(item/name).inner_html
|
23
|
-
end
|
24
|
-
|
25
13
|
end
|
26
14
|
end
|
27
15
|
end
|
data/lib/jkl/text_client.rb
CHANGED
@@ -2,10 +2,9 @@ module Jkl
|
|
2
2
|
module Text
|
3
3
|
class << self
|
4
4
|
|
5
|
-
def
|
6
|
-
remove_short_lines(strip_all_tags(remove_script_tags(
|
5
|
+
def plain_text(document, words_on_line = 5)
|
6
|
+
remove_short_lines(strip_all_tags(remove_script_tags(document)), words_on_line)
|
7
7
|
end
|
8
|
-
alias :clean :sanitize
|
9
8
|
|
10
9
|
def strip_all_tags(text)
|
11
10
|
text.gsub(/<\/?[^>]*>/, "")
|
data/lib/jkl.rb
CHANGED
@@ -1,32 +1,22 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "jkl/rss_client"
|
2
|
+
require_relative "jkl/calais_client"
|
3
|
+
require_relative "jkl/text_client"
|
3
4
|
|
4
|
-
require "
|
5
|
-
require "jkl/rest_client"
|
6
|
-
require "jkl/calais_client"
|
7
|
-
require "jkl/text_client"
|
5
|
+
require "mechanize"
|
8
6
|
|
9
7
|
module Jkl
|
10
8
|
class << self
|
11
9
|
|
10
|
+
def get(url)
|
11
|
+
agent = Mechanize.new
|
12
|
+
agent.get(url).body
|
13
|
+
end
|
14
|
+
|
12
15
|
def links(feed)
|
13
|
-
links = Jkl::Rss::links(Jkl::Rss::items(Jkl::
|
16
|
+
links = Jkl::Rss::links(Jkl::Rss::items(Jkl::get(feed)))
|
14
17
|
links.each do |link|
|
15
18
|
yield link if block_given?
|
16
19
|
end
|
17
20
|
end
|
18
|
-
|
19
|
-
def topix_links(keyphrase, url = "http://www.topix.net/rss/search/article?q=")
|
20
|
-
links("#{url}#{keyphrase}")
|
21
|
-
end
|
22
|
-
|
23
|
-
def tags(key, link)
|
24
|
-
text = Jkl::Text::sanitize(Jkl::get_from(link))
|
25
|
-
Jkl::Extraction::tags(key, text)
|
26
|
-
end
|
27
|
-
|
28
|
-
def trends(url = "http://search.twitter.com/trends.json")
|
29
|
-
JSON.parse(Jkl::get_from(url))["trends"].map{|t| t["name"]}
|
30
|
-
end
|
31
21
|
end
|
32
22
|
end
|
data/test/unit/jkl_test.rb
CHANGED
@@ -2,78 +2,56 @@ require "test/unit"
|
|
2
2
|
require "shoulda"
|
3
3
|
require "webmock/test_unit"
|
4
4
|
require "yaml"
|
5
|
+
|
5
6
|
require_relative "../../lib/jkl"
|
6
7
|
|
7
8
|
class JklTest < Test::Unit::TestCase
|
8
9
|
include WebMock::API
|
9
10
|
|
10
|
-
context "
|
11
|
+
context "for documents, plain text and tags" do
|
11
12
|
setup do
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
trends = Jkl::trends
|
19
|
-
assert trends.length == 10
|
20
|
-
assert trends[0] == "London"
|
13
|
+
@url = "http://www.bbc.co.uk"
|
14
|
+
response = File.read('test/fixtures/bbc_story.html')
|
15
|
+
stub_request(:get, @url).to_return(
|
16
|
+
:status => 200,
|
17
|
+
:body => response,
|
18
|
+
:headers => {'Content-Type' => 'text/html'})
|
21
19
|
end
|
22
20
|
|
23
|
-
should "
|
24
|
-
|
25
|
-
|
26
|
-
assert articles[0] = "http://www.localnews8.com/Global/story.asp?S=10876507"
|
21
|
+
should "Get a document from a URL" do
|
22
|
+
doc = Jkl::get(@url)
|
23
|
+
assert_not_nil doc
|
27
24
|
end
|
28
|
-
|
29
|
-
should "
|
30
|
-
|
31
|
-
text = Jkl::Text::
|
32
|
-
|
25
|
+
|
26
|
+
should "Get the plain text version of a document" do
|
27
|
+
document = Jkl::get(@url)
|
28
|
+
text = Jkl::Text::plain_text(document,2)
|
29
|
+
assert_equal 8884, text.length
|
33
30
|
end
|
34
|
-
|
35
|
-
should "
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Barack Obama said today that he expects there
|
41
|
-
to be conflict within his new security team after
|
42
|
-
confirming Hillary Clinton as his choice for US Secretary of State."
|
43
|
-
EOF
|
44
|
-
tags = Jkl::Extraction::tags(key, text)
|
45
|
-
assert tags["Person"][0] == "Barack Obama"
|
31
|
+
|
32
|
+
should "Get the keywords from a document" do
|
33
|
+
document = Jkl::get(@url)
|
34
|
+
text = Jkl::Text::plain_text(document,2)
|
35
|
+
tags = Jkl::Extraction::tags(calais_key, text)
|
36
|
+
assert ! tags.empty?
|
46
37
|
end
|
47
38
|
end
|
48
39
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
{"name":"Cannes","url":"http://search.twitter.com/search?q=Cannes"},
|
59
|
-
{"name":"Verona","url":"http://search.twitter.com/search?q=Verona"},
|
60
|
-
{"name":"Milan","url":"http://search.twitter.com/search?q=Milan"},
|
61
|
-
{"name":"New York","url":"http://search.twitter.com/search?q=New%20York"},
|
62
|
-
{"name":"Paris","url":"http://search.twitter.com/search?q=Paris"},
|
63
|
-
{"name":"Melbourne","url":"http://search.twitter.com/search?q=Melbourne"}
|
64
|
-
],"as_of":"Sat, 1 Jan 1970 00:00:00 +0000"}
|
65
|
-
EOF
|
66
|
-
stub_request(:get, url).to_return(:body => response)
|
67
|
-
end
|
68
|
-
def stub_topix
|
69
|
-
url = YAML::load_file('config/config.yml')['topix']
|
70
|
-
response = File.read('test/fixtures/topix_rss.xml')
|
71
|
-
stub_request(:get, "#{url}London").to_return(:body => response)
|
72
|
-
end
|
73
|
-
def stub_news_article
|
74
|
-
response = File.read('test/fixtures/bbc_story.html')
|
75
|
-
stub_request(:get, "http://www.localnews8.com/Global/story.asp?S=10876507").to_return(
|
76
|
-
:body => response
|
77
|
-
)
|
40
|
+
context "for RSS" do
|
41
|
+
should "Get links from a feed" do
|
42
|
+
feed = "http://feeds.bbci.co.uk/news/rss.xml"
|
43
|
+
response = File.read('test/fixtures/topix_rss.xml')
|
44
|
+
stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
|
45
|
+
to_return(:status => 200, :body => response, :headers => {})
|
46
|
+
first_link = "http://www.localnews8.com/Global/story.asp?S=10876507"
|
47
|
+
assert_equal first_link, Jkl::links(feed).first
|
48
|
+
end
|
78
49
|
end
|
79
|
-
|
50
|
+
|
51
|
+
private
|
52
|
+
def calais_key
|
53
|
+
keys = "config/keys.yml"
|
54
|
+
raise "READ:::::::: You need to create #{keys} and put your calais credentials in it." unless File.exist?(keys)
|
55
|
+
YAML::load_file(keys)['calais']
|
56
|
+
end
|
57
|
+
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 2
|
8
|
+
- 0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- sshingler
|
@@ -14,11 +14,11 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date:
|
17
|
+
date: 2011-05-28 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
|
-
name:
|
21
|
+
name: rake
|
22
22
|
prerelease: false
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
24
|
none: false
|
@@ -28,12 +28,12 @@ dependencies:
|
|
28
28
|
segments:
|
29
29
|
- 0
|
30
30
|
- 8
|
31
|
-
-
|
32
|
-
version: 0.8.
|
31
|
+
- 7
|
32
|
+
version: 0.8.7
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
36
|
+
name: mechanize
|
37
37
|
prerelease: false
|
38
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
@@ -42,24 +42,24 @@ dependencies:
|
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
segments:
|
44
44
|
- 1
|
45
|
-
-
|
46
|
-
-
|
47
|
-
version: 1.
|
45
|
+
- 0
|
46
|
+
- 0
|
47
|
+
version: 1.0.0
|
48
48
|
type: :runtime
|
49
49
|
version_requirements: *id002
|
50
50
|
- !ruby/object:Gem::Dependency
|
51
|
-
name:
|
51
|
+
name: nokogiri
|
52
52
|
prerelease: false
|
53
53
|
requirement: &id003 !ruby/object:Gem::Requirement
|
54
54
|
none: false
|
55
55
|
requirements:
|
56
|
-
- -
|
56
|
+
- - ~>
|
57
57
|
- !ruby/object:Gem::Version
|
58
58
|
segments:
|
59
59
|
- 1
|
60
60
|
- 4
|
61
|
-
-
|
62
|
-
version: 1.4.
|
61
|
+
- 4
|
62
|
+
version: 1.4.4
|
63
63
|
type: :runtime
|
64
64
|
version_requirements: *id003
|
65
65
|
- !ruby/object:Gem::Dependency
|
@@ -73,40 +73,41 @@ dependencies:
|
|
73
73
|
segments:
|
74
74
|
- 0
|
75
75
|
- 0
|
76
|
-
-
|
77
|
-
version: 0.0.
|
76
|
+
- 11
|
77
|
+
version: 0.0.11
|
78
78
|
type: :runtime
|
79
79
|
version_requirements: *id004
|
80
|
-
description: Jakal is a Ruby library
|
80
|
+
description: Jakal is a Ruby library for tagging keywords from web pages.
|
81
81
|
email: "'shingler@gmail.com'"
|
82
82
|
executables: []
|
83
83
|
|
84
84
|
extensions: []
|
85
85
|
|
86
|
-
extra_rdoc_files:
|
87
|
-
|
88
|
-
- License.txt
|
86
|
+
extra_rdoc_files: []
|
87
|
+
|
89
88
|
files:
|
89
|
+
- .gitignore
|
90
|
+
- Gemfile
|
91
|
+
- Gemfile.lock
|
92
|
+
- License.txt
|
93
|
+
- README.md
|
94
|
+
- Rakefile
|
95
|
+
- jkl.gemspec
|
90
96
|
- lib/jkl.rb
|
91
97
|
- lib/jkl/calais_client.rb
|
92
|
-
- lib/jkl/rest_client.rb
|
93
98
|
- lib/jkl/rss_client.rb
|
94
99
|
- lib/jkl/text_client.rb
|
95
100
|
- test/fixtures/bbc_story.html
|
96
101
|
- test/fixtures/topix_rss.xml
|
97
102
|
- test/unit/jkl_test.rb
|
98
103
|
- test/unit/text_cleaning_test.rb
|
99
|
-
- config/config.yml
|
100
|
-
- README.md
|
101
|
-
- License.txt
|
102
104
|
has_rdoc: true
|
103
105
|
homepage: http://github.com/sshingler/jkl
|
104
106
|
licenses: []
|
105
107
|
|
106
108
|
post_install_message:
|
107
|
-
rdoc_options:
|
108
|
-
|
109
|
-
- --charset=UTF-8
|
109
|
+
rdoc_options: []
|
110
|
+
|
110
111
|
require_paths:
|
111
112
|
- lib
|
112
113
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -130,7 +131,10 @@ requirements: []
|
|
130
131
|
rubyforge_project:
|
131
132
|
rubygems_version: 1.3.7
|
132
133
|
signing_key:
|
133
|
-
specification_version:
|
134
|
-
summary: Jakal is a Ruby library
|
135
|
-
test_files:
|
136
|
-
|
134
|
+
specification_version: 3
|
135
|
+
summary: Jakal is a Ruby library for tagging keywords from web pages.
|
136
|
+
test_files:
|
137
|
+
- test/fixtures/bbc_story.html
|
138
|
+
- test/fixtures/topix_rss.xml
|
139
|
+
- test/unit/jkl_test.rb
|
140
|
+
- test/unit/text_cleaning_test.rb
|
data/config/config.yml
DELETED
data/lib/jkl/rest_client.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
require 'hpricot'
|
3
|
-
|
4
|
-
module Jkl
|
5
|
-
class << self
|
6
|
-
|
7
|
-
def post_to(url, post_args = {})
|
8
|
-
begin
|
9
|
-
uri = URI.parse(url)
|
10
|
-
resp, data = Net::HTTP.post_form(uri, post_args)
|
11
|
-
data
|
12
|
-
rescue URI::InvalidURIError => e
|
13
|
-
puts("WARN: JKL Invalid URI: #{e}")
|
14
|
-
rescue SocketError => e
|
15
|
-
puts("WARN: JKL Could not connect: #{e}")
|
16
|
-
rescue Errno::ECONNREFUSED => e
|
17
|
-
puts("WARN: JKL Connection refused: #{e}")
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
def get_from(uri)
|
22
|
-
begin
|
23
|
-
response = Net::HTTP.get_response(URI.parse(uri))
|
24
|
-
response.body
|
25
|
-
rescue URI::InvalidURIError => e
|
26
|
-
puts("WARN: JKL Invalid URI: #{e}")
|
27
|
-
rescue SocketError => e
|
28
|
-
puts("WARN: JKL Could not connect: #{e}")
|
29
|
-
rescue Errno::ECONNREFUSED => e
|
30
|
-
puts("WARN: JKL Connection refused: #{e}")
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def get_from_over_https(host, path)
|
35
|
-
http = Net::HTTP.new(host, "443")
|
36
|
-
http.use_ssl = true
|
37
|
-
http.get2(path) # returns [status, data]
|
38
|
-
end
|
39
|
-
|
40
|
-
def get_xml_from(uri)
|
41
|
-
Hpricot.XML(get_from(uri))
|
42
|
-
end
|
43
|
-
|
44
|
-
def document_from(text)
|
45
|
-
Hpricot(text)
|
46
|
-
end
|
47
|
-
|
48
|
-
end
|
49
|
-
end
|