jakal 0.1.96 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +57 -0
- data/Rakefile +7 -0
- data/jkl.gemspec +19 -0
- data/lib/jkl/rss_client.rb +4 -16
- data/lib/jkl/text_client.rb +2 -3
- data/lib/jkl.rb +10 -20
- data/test/unit/jkl_test.rb +39 -61
- data/test/unit/text_cleaning_test.rb +1 -1
- metadata +36 -32
- data/config/config.yml +0 -4
- data/lib/jkl/rest_client.rb +0 -49
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
jakal (0.2.0)
|
5
|
+
calais (>= 0.0.11)
|
6
|
+
mechanize (>= 1.0.0)
|
7
|
+
nokogiri (~> 1.4.4)
|
8
|
+
rake (>= 0.8.7)
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: http://rubygems.org/
|
12
|
+
specs:
|
13
|
+
addressable (2.2.6)
|
14
|
+
archive-tar-minitar (0.5.2)
|
15
|
+
calais (0.0.11)
|
16
|
+
curb (>= 0.1.4)
|
17
|
+
json (>= 1.1.3)
|
18
|
+
nokogiri (>= 1.3.3)
|
19
|
+
columnize (0.3.2)
|
20
|
+
crack (0.1.8)
|
21
|
+
curb (0.7.15)
|
22
|
+
json (1.5.1)
|
23
|
+
linecache19 (0.5.12)
|
24
|
+
ruby_core_source (>= 0.1.4)
|
25
|
+
mechanize (1.0.0)
|
26
|
+
nokogiri (>= 1.2.1)
|
27
|
+
nokogiri (1.4.4)
|
28
|
+
rake (0.8.7)
|
29
|
+
ruby-debug-base19 (0.11.25)
|
30
|
+
columnize (>= 0.3.1)
|
31
|
+
linecache19 (>= 0.5.11)
|
32
|
+
ruby_core_source (>= 0.1.4)
|
33
|
+
ruby-debug19 (0.11.6)
|
34
|
+
columnize (>= 0.3.1)
|
35
|
+
linecache19 (>= 0.5.11)
|
36
|
+
ruby-debug-base19 (>= 0.11.19)
|
37
|
+
ruby_core_source (0.1.5)
|
38
|
+
archive-tar-minitar (>= 0.5.2)
|
39
|
+
shoulda (2.11.3)
|
40
|
+
test-unit (2.3.0)
|
41
|
+
webmock (1.6.2)
|
42
|
+
addressable (>= 2.2.2)
|
43
|
+
crack (>= 0.1.7)
|
44
|
+
|
45
|
+
PLATFORMS
|
46
|
+
ruby
|
47
|
+
|
48
|
+
DEPENDENCIES
|
49
|
+
calais (>= 0.0.11)
|
50
|
+
jakal!
|
51
|
+
mechanize (>= 1.0.0)
|
52
|
+
nokogiri (~> 1.4.4)
|
53
|
+
rake (>= 0.8.7)
|
54
|
+
ruby-debug19 (= 0.11.6)
|
55
|
+
shoulda (= 2.11.3)
|
56
|
+
test-unit (= 2.3.0)
|
57
|
+
webmock (= 1.6.2)
|
data/Rakefile
ADDED
data/jkl.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.name = %q{jakal}
|
3
|
+
s.version = "0.2.0"
|
4
|
+
s.platform = Gem::Platform::RUBY
|
5
|
+
s.authors = ['sshingler']
|
6
|
+
s.homepage = %q{http://github.com/sshingler/jkl}
|
7
|
+
s.description = %q{Jakal is a Ruby library for tagging keywords from web pages.}
|
8
|
+
s.summary = s.description
|
9
|
+
s.email = %q{'shingler@gmail.com'}
|
10
|
+
|
11
|
+
s.require_paths = ["lib"]
|
12
|
+
s.files = `git ls-files`.split("\n")
|
13
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
14
|
+
|
15
|
+
s.add_dependency(%q<rake>, [">= 0.8.7"])
|
16
|
+
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
17
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
|
18
|
+
s.add_dependency(%q<calais>, [">= 0.0.11"])
|
19
|
+
end
|
data/lib/jkl/rss_client.rb
CHANGED
@@ -1,27 +1,15 @@
|
|
1
|
-
require 'hpricot'
|
2
1
|
|
3
2
|
module Jkl
|
4
3
|
module Rss
|
5
4
|
class << self
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
def items(rss)
|
6
|
+
rss_doc = Nokogiri::Slop(rss)
|
7
|
+
rss_doc.rss.channel.item
|
9
8
|
end
|
10
9
|
|
11
10
|
def links(items)
|
12
|
-
items.map{|item|
|
13
|
-
end
|
14
|
-
|
15
|
-
def descriptions(items)
|
16
|
-
items.map do |item|
|
17
|
-
attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
18
|
-
end
|
11
|
+
items.map{|item| item.link.inner_html}
|
19
12
|
end
|
20
|
-
|
21
|
-
def attribute_from(item, name)
|
22
|
-
(item/name).inner_html
|
23
|
-
end
|
24
|
-
|
25
13
|
end
|
26
14
|
end
|
27
15
|
end
|
data/lib/jkl/text_client.rb
CHANGED
@@ -2,10 +2,9 @@ module Jkl
|
|
2
2
|
module Text
|
3
3
|
class << self
|
4
4
|
|
5
|
-
def
|
6
|
-
remove_short_lines(strip_all_tags(remove_script_tags(
|
5
|
+
def plain_text(document, words_on_line = 5)
|
6
|
+
remove_short_lines(strip_all_tags(remove_script_tags(document)), words_on_line)
|
7
7
|
end
|
8
|
-
alias :clean :sanitize
|
9
8
|
|
10
9
|
def strip_all_tags(text)
|
11
10
|
text.gsub(/<\/?[^>]*>/, "")
|
data/lib/jkl.rb
CHANGED
@@ -1,32 +1,22 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "jkl/rss_client"
|
2
|
+
require_relative "jkl/calais_client"
|
3
|
+
require_relative "jkl/text_client"
|
3
4
|
|
4
|
-
require "
|
5
|
-
require "jkl/rest_client"
|
6
|
-
require "jkl/calais_client"
|
7
|
-
require "jkl/text_client"
|
5
|
+
require "mechanize"
|
8
6
|
|
9
7
|
module Jkl
|
10
8
|
class << self
|
11
9
|
|
10
|
+
def get(url)
|
11
|
+
agent = Mechanize.new
|
12
|
+
agent.get(url).body
|
13
|
+
end
|
14
|
+
|
12
15
|
def links(feed)
|
13
|
-
links = Jkl::Rss::links(Jkl::Rss::items(Jkl::
|
16
|
+
links = Jkl::Rss::links(Jkl::Rss::items(Jkl::get(feed)))
|
14
17
|
links.each do |link|
|
15
18
|
yield link if block_given?
|
16
19
|
end
|
17
20
|
end
|
18
|
-
|
19
|
-
def topix_links(keyphrase, url = "http://www.topix.net/rss/search/article?q=")
|
20
|
-
links("#{url}#{keyphrase}")
|
21
|
-
end
|
22
|
-
|
23
|
-
def tags(key, link)
|
24
|
-
text = Jkl::Text::sanitize(Jkl::get_from(link))
|
25
|
-
Jkl::Extraction::tags(key, text)
|
26
|
-
end
|
27
|
-
|
28
|
-
def trends(url = "http://search.twitter.com/trends.json")
|
29
|
-
JSON.parse(Jkl::get_from(url))["trends"].map{|t| t["name"]}
|
30
|
-
end
|
31
21
|
end
|
32
22
|
end
|
data/test/unit/jkl_test.rb
CHANGED
@@ -2,78 +2,56 @@ require "test/unit"
|
|
2
2
|
require "shoulda"
|
3
3
|
require "webmock/test_unit"
|
4
4
|
require "yaml"
|
5
|
+
|
5
6
|
require_relative "../../lib/jkl"
|
6
7
|
|
7
8
|
class JklTest < Test::Unit::TestCase
|
8
9
|
include WebMock::API
|
9
10
|
|
10
|
-
context "
|
11
|
+
context "for documents, plain text and tags" do
|
11
12
|
setup do
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
trends = Jkl::trends
|
19
|
-
assert trends.length == 10
|
20
|
-
assert trends[0] == "London"
|
13
|
+
@url = "http://www.bbc.co.uk"
|
14
|
+
response = File.read('test/fixtures/bbc_story.html')
|
15
|
+
stub_request(:get, @url).to_return(
|
16
|
+
:status => 200,
|
17
|
+
:body => response,
|
18
|
+
:headers => {'Content-Type' => 'text/html'})
|
21
19
|
end
|
22
20
|
|
23
|
-
should "
|
24
|
-
|
25
|
-
|
26
|
-
assert articles[0] = "http://www.localnews8.com/Global/story.asp?S=10876507"
|
21
|
+
should "Get a document from a URL" do
|
22
|
+
doc = Jkl::get(@url)
|
23
|
+
assert_not_nil doc
|
27
24
|
end
|
28
|
-
|
29
|
-
should "
|
30
|
-
|
31
|
-
text = Jkl::Text::
|
32
|
-
|
25
|
+
|
26
|
+
should "Get the plain text version of a document" do
|
27
|
+
document = Jkl::get(@url)
|
28
|
+
text = Jkl::Text::plain_text(document,2)
|
29
|
+
assert_equal 8884, text.length
|
33
30
|
end
|
34
|
-
|
35
|
-
should "
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Barack Obama said today that he expects there
|
41
|
-
to be conflict within his new security team after
|
42
|
-
confirming Hillary Clinton as his choice for US Secretary of State."
|
43
|
-
EOF
|
44
|
-
tags = Jkl::Extraction::tags(key, text)
|
45
|
-
assert tags["Person"][0] == "Barack Obama"
|
31
|
+
|
32
|
+
should "Get the keywords from a document" do
|
33
|
+
document = Jkl::get(@url)
|
34
|
+
text = Jkl::Text::plain_text(document,2)
|
35
|
+
tags = Jkl::Extraction::tags(calais_key, text)
|
36
|
+
assert ! tags.empty?
|
46
37
|
end
|
47
38
|
end
|
48
39
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
{"name":"Cannes","url":"http://search.twitter.com/search?q=Cannes"},
|
59
|
-
{"name":"Verona","url":"http://search.twitter.com/search?q=Verona"},
|
60
|
-
{"name":"Milan","url":"http://search.twitter.com/search?q=Milan"},
|
61
|
-
{"name":"New York","url":"http://search.twitter.com/search?q=New%20York"},
|
62
|
-
{"name":"Paris","url":"http://search.twitter.com/search?q=Paris"},
|
63
|
-
{"name":"Melbourne","url":"http://search.twitter.com/search?q=Melbourne"}
|
64
|
-
],"as_of":"Sat, 1 Jan 1970 00:00:00 +0000"}
|
65
|
-
EOF
|
66
|
-
stub_request(:get, url).to_return(:body => response)
|
67
|
-
end
|
68
|
-
def stub_topix
|
69
|
-
url = YAML::load_file('config/config.yml')['topix']
|
70
|
-
response = File.read('test/fixtures/topix_rss.xml')
|
71
|
-
stub_request(:get, "#{url}London").to_return(:body => response)
|
72
|
-
end
|
73
|
-
def stub_news_article
|
74
|
-
response = File.read('test/fixtures/bbc_story.html')
|
75
|
-
stub_request(:get, "http://www.localnews8.com/Global/story.asp?S=10876507").to_return(
|
76
|
-
:body => response
|
77
|
-
)
|
40
|
+
context "for RSS" do
|
41
|
+
should "Get links from a feed" do
|
42
|
+
feed = "http://feeds.bbci.co.uk/news/rss.xml"
|
43
|
+
response = File.read('test/fixtures/topix_rss.xml')
|
44
|
+
stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
|
45
|
+
to_return(:status => 200, :body => response, :headers => {})
|
46
|
+
first_link = "http://www.localnews8.com/Global/story.asp?S=10876507"
|
47
|
+
assert_equal first_link, Jkl::links(feed).first
|
48
|
+
end
|
78
49
|
end
|
79
|
-
|
50
|
+
|
51
|
+
private
|
52
|
+
def calais_key
|
53
|
+
keys = "config/keys.yml"
|
54
|
+
raise "READ:::::::: You need to create #{keys} and put your calais credentials in it." unless File.exist?(keys)
|
55
|
+
YAML::load_file(keys)['calais']
|
56
|
+
end
|
57
|
+
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 2
|
8
|
+
- 0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- sshingler
|
@@ -14,11 +14,11 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date:
|
17
|
+
date: 2011-05-28 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
|
-
name:
|
21
|
+
name: rake
|
22
22
|
prerelease: false
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
24
|
none: false
|
@@ -28,12 +28,12 @@ dependencies:
|
|
28
28
|
segments:
|
29
29
|
- 0
|
30
30
|
- 8
|
31
|
-
-
|
32
|
-
version: 0.8.
|
31
|
+
- 7
|
32
|
+
version: 0.8.7
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
36
|
+
name: mechanize
|
37
37
|
prerelease: false
|
38
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
@@ -42,24 +42,24 @@ dependencies:
|
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
segments:
|
44
44
|
- 1
|
45
|
-
-
|
46
|
-
-
|
47
|
-
version: 1.
|
45
|
+
- 0
|
46
|
+
- 0
|
47
|
+
version: 1.0.0
|
48
48
|
type: :runtime
|
49
49
|
version_requirements: *id002
|
50
50
|
- !ruby/object:Gem::Dependency
|
51
|
-
name:
|
51
|
+
name: nokogiri
|
52
52
|
prerelease: false
|
53
53
|
requirement: &id003 !ruby/object:Gem::Requirement
|
54
54
|
none: false
|
55
55
|
requirements:
|
56
|
-
- -
|
56
|
+
- - ~>
|
57
57
|
- !ruby/object:Gem::Version
|
58
58
|
segments:
|
59
59
|
- 1
|
60
60
|
- 4
|
61
|
-
-
|
62
|
-
version: 1.4.
|
61
|
+
- 4
|
62
|
+
version: 1.4.4
|
63
63
|
type: :runtime
|
64
64
|
version_requirements: *id003
|
65
65
|
- !ruby/object:Gem::Dependency
|
@@ -73,40 +73,41 @@ dependencies:
|
|
73
73
|
segments:
|
74
74
|
- 0
|
75
75
|
- 0
|
76
|
-
-
|
77
|
-
version: 0.0.
|
76
|
+
- 11
|
77
|
+
version: 0.0.11
|
78
78
|
type: :runtime
|
79
79
|
version_requirements: *id004
|
80
|
-
description: Jakal is a Ruby library
|
80
|
+
description: Jakal is a Ruby library for tagging keywords from web pages.
|
81
81
|
email: "'shingler@gmail.com'"
|
82
82
|
executables: []
|
83
83
|
|
84
84
|
extensions: []
|
85
85
|
|
86
|
-
extra_rdoc_files:
|
87
|
-
|
88
|
-
- License.txt
|
86
|
+
extra_rdoc_files: []
|
87
|
+
|
89
88
|
files:
|
89
|
+
- .gitignore
|
90
|
+
- Gemfile
|
91
|
+
- Gemfile.lock
|
92
|
+
- License.txt
|
93
|
+
- README.md
|
94
|
+
- Rakefile
|
95
|
+
- jkl.gemspec
|
90
96
|
- lib/jkl.rb
|
91
97
|
- lib/jkl/calais_client.rb
|
92
|
-
- lib/jkl/rest_client.rb
|
93
98
|
- lib/jkl/rss_client.rb
|
94
99
|
- lib/jkl/text_client.rb
|
95
100
|
- test/fixtures/bbc_story.html
|
96
101
|
- test/fixtures/topix_rss.xml
|
97
102
|
- test/unit/jkl_test.rb
|
98
103
|
- test/unit/text_cleaning_test.rb
|
99
|
-
- config/config.yml
|
100
|
-
- README.md
|
101
|
-
- License.txt
|
102
104
|
has_rdoc: true
|
103
105
|
homepage: http://github.com/sshingler/jkl
|
104
106
|
licenses: []
|
105
107
|
|
106
108
|
post_install_message:
|
107
|
-
rdoc_options:
|
108
|
-
|
109
|
-
- --charset=UTF-8
|
109
|
+
rdoc_options: []
|
110
|
+
|
110
111
|
require_paths:
|
111
112
|
- lib
|
112
113
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -130,7 +131,10 @@ requirements: []
|
|
130
131
|
rubyforge_project:
|
131
132
|
rubygems_version: 1.3.7
|
132
133
|
signing_key:
|
133
|
-
specification_version:
|
134
|
-
summary: Jakal is a Ruby library
|
135
|
-
test_files:
|
136
|
-
|
134
|
+
specification_version: 3
|
135
|
+
summary: Jakal is a Ruby library for tagging keywords from web pages.
|
136
|
+
test_files:
|
137
|
+
- test/fixtures/bbc_story.html
|
138
|
+
- test/fixtures/topix_rss.xml
|
139
|
+
- test/unit/jkl_test.rb
|
140
|
+
- test/unit/text_cleaning_test.rb
|
data/config/config.yml
DELETED
data/lib/jkl/rest_client.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
require 'hpricot'
|
3
|
-
|
4
|
-
module Jkl
|
5
|
-
class << self
|
6
|
-
|
7
|
-
def post_to(url, post_args = {})
|
8
|
-
begin
|
9
|
-
uri = URI.parse(url)
|
10
|
-
resp, data = Net::HTTP.post_form(uri, post_args)
|
11
|
-
data
|
12
|
-
rescue URI::InvalidURIError => e
|
13
|
-
puts("WARN: JKL Invalid URI: #{e}")
|
14
|
-
rescue SocketError => e
|
15
|
-
puts("WARN: JKL Could not connect: #{e}")
|
16
|
-
rescue Errno::ECONNREFUSED => e
|
17
|
-
puts("WARN: JKL Connection refused: #{e}")
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
def get_from(uri)
|
22
|
-
begin
|
23
|
-
response = Net::HTTP.get_response(URI.parse(uri))
|
24
|
-
response.body
|
25
|
-
rescue URI::InvalidURIError => e
|
26
|
-
puts("WARN: JKL Invalid URI: #{e}")
|
27
|
-
rescue SocketError => e
|
28
|
-
puts("WARN: JKL Could not connect: #{e}")
|
29
|
-
rescue Errno::ECONNREFUSED => e
|
30
|
-
puts("WARN: JKL Connection refused: #{e}")
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def get_from_over_https(host, path)
|
35
|
-
http = Net::HTTP.new(host, "443")
|
36
|
-
http.use_ssl = true
|
37
|
-
http.get2(path) # returns [status, data]
|
38
|
-
end
|
39
|
-
|
40
|
-
def get_xml_from(uri)
|
41
|
-
Hpricot.XML(get_from(uri))
|
42
|
-
end
|
43
|
-
|
44
|
-
def document_from(text)
|
45
|
-
Hpricot(text)
|
46
|
-
end
|
47
|
-
|
48
|
-
end
|
49
|
-
end
|