jakal 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/features/calais.feature +1 -1
- data/features/step_definitions/calais_steps.rb +2 -2
- data/lib/jkl.rb +40 -32
- data/lib/jkl/calais_client.rb +57 -50
- data/lib/jkl/rest_client.rb +29 -25
- data/lib/jkl/rss_client.rb +11 -7
- data/lib/jkl/url_doc_handler.rb +24 -20
- metadata +3 -5
- data/features/processing.feature +0 -16
- data/features/step_definitions/processing_steps.rb +0 -30
data/features/calais.feature
CHANGED
@@ -21,7 +21,7 @@ Feature: Calais-Specific features
|
|
21
21
|
Scenario: Get nested tags from calais
|
22
22
|
Given I have some simple text
|
23
23
|
When I request the nested entities from calais
|
24
|
-
Then I should
|
24
|
+
Then I should receive the entities grouped into categories
|
25
25
|
|
26
26
|
Scenario: Clean up blank items from a calais response
|
27
27
|
Given I have a mock calais response
|
@@ -44,9 +44,9 @@ end
|
|
44
44
|
|
45
45
|
When /^I request the nested entities from calais$/ do
|
46
46
|
key = YAML::load_file('config/keys.yml')['calais']
|
47
|
-
@response = Jkl::
|
47
|
+
@response = Jkl::tags key, @text
|
48
48
|
end
|
49
49
|
|
50
|
-
Then /^I should
|
50
|
+
Then /^I should receive the entities grouped into categories$/ do
|
51
51
|
@response.eql?({"Person"=>["Barack Obama", "Hillary Clinton"], "Position"=>["Secretary of State"]}).should == true
|
52
52
|
end
|
data/lib/jkl.rb
CHANGED
@@ -6,43 +6,51 @@ require "jkl/url_doc_handler.rb"
|
|
6
6
|
|
7
7
|
module Jkl
|
8
8
|
|
9
|
-
|
10
|
-
get_from_as_xml "#{feed}#{keyphrase}"
|
11
|
-
end
|
9
|
+
class << self
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
descriptions = ""
|
16
|
-
items.each do |item|
|
17
|
-
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
11
|
+
def headlines(feed, keyphrase)
|
12
|
+
get_from_as_xml "#{feed}#{keyphrase}"
|
18
13
|
end
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
descriptions
|
14
|
+
|
15
|
+
def pages(headlines)
|
16
|
+
items = get_items_from headlines
|
17
|
+
descriptions = ""
|
18
|
+
items.each do |item|
|
19
|
+
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
20
|
+
end
|
21
|
+
descriptions
|
27
22
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
23
|
+
|
24
|
+
def descriptions(headlines)
|
25
|
+
items = get_items_from headlines
|
26
|
+
descriptions = []
|
27
|
+
items.each do |item|
|
28
|
+
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
29
|
+
end
|
30
|
+
descriptions
|
31
|
+
end
|
32
|
+
|
33
|
+
def links(headlines)
|
34
|
+
items = get_items_from headlines
|
35
|
+
links = []
|
36
|
+
items.each do |item|
|
37
|
+
links << attribute_from(item, :link)
|
38
|
+
end
|
39
|
+
links
|
36
40
|
end
|
37
|
-
links
|
38
|
-
end
|
39
41
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
42
|
+
def tags(key, text)
|
43
|
+
nested_list = {}
|
44
|
+
entities(key,text).each do |a|
|
45
|
+
nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
|
46
|
+
end
|
47
|
+
nested_list
|
48
|
+
end
|
49
|
+
|
50
|
+
def entities(key,text)
|
51
|
+
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
52
|
+
end
|
53
|
+
|
46
54
|
end
|
47
55
|
|
48
56
|
end
|
data/lib/jkl/calais_client.rb
CHANGED
@@ -4,62 +4,69 @@ require "calais"
|
|
4
4
|
|
5
5
|
module Jkl
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
class << self
|
8
|
+
|
9
|
+
#using the calais gem
|
10
|
+
def calais_response(key, pages)
|
11
|
+
Calais.process_document(
|
12
|
+
:content => pages,
|
13
|
+
:content_type => :text,
|
14
|
+
:license_id => key
|
15
|
+
)
|
16
16
|
end
|
17
|
-
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
18
|
+
def get_from_calais(content)
|
19
|
+
begin
|
20
|
+
license_id = YAML::load_file('config/keys.yml')['calais']
|
21
|
+
c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
|
22
|
+
post_args = { 'licenseID' => license_id, 'content' => content,
|
23
|
+
'paramsXML' => paramsXML('application/json') }
|
24
|
+
post_to(c_uri, post_args)
|
25
|
+
rescue Exception => e
|
26
|
+
puts e
|
27
|
+
end
|
27
28
|
end
|
29
|
+
|
30
|
+
def get_tag_from_json(response)
|
31
|
+
result = JSON.parse response
|
32
|
+
result.delete_if {|key, value| key == "doc" } # ditching the doc
|
33
|
+
cleaned_result = []
|
34
|
+
result.each do |key,tag|
|
35
|
+
tag = Jkl::clean_unwanted_items_from_hash tag
|
36
|
+
cleaned_result << tag
|
37
|
+
yield tag if block_given?
|
38
|
+
end
|
28
39
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
def self.get_calais_metadata(key, text)
|
33
|
-
name_value_pairs = Jkl::tags(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
34
|
-
nested_list = {}
|
35
|
-
name_value_pairs.each { |a| nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }}
|
36
|
-
nested_list
|
37
|
-
end
|
40
|
+
cleaned_result
|
41
|
+
end
|
38
42
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
#jkl doesn't work with these aspects of the calais response, also removing blanks
|
44
|
+
def clean_unwanted_items_from_hash h
|
45
|
+
h.delete_if {|k, v| k == "relevance" }
|
46
|
+
h.delete_if {|k, v| k == "instances" }
|
47
|
+
h.delete_if {|k, v| v == "N/A"}
|
48
|
+
h.delete_if {|k, v| v == []}
|
49
|
+
h.delete_if {|k, v| v == ""}
|
50
|
+
h.delete_if {|k, v| k == "_typeGroup"}
|
51
|
+
h
|
52
|
+
end
|
49
53
|
|
50
|
-
|
54
|
+
private
|
55
|
+
|
56
|
+
def paramsXML(format)
|
57
|
+
<<-paramsXML;
|
58
|
+
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
|
59
|
+
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
60
|
+
<c:processingDirectives
|
61
|
+
c:contentType="text/txt"
|
62
|
+
c:outputFormat="#{format}">
|
63
|
+
</c:processingDirectives>
|
64
|
+
<c:userDirectives />
|
65
|
+
<c:externalMetadata />
|
66
|
+
</c:params>
|
67
|
+
paramsXML
|
68
|
+
end
|
51
69
|
|
52
|
-
def self.paramsXML(format)
|
53
|
-
<<-paramsXML;
|
54
|
-
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
|
55
|
-
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
56
|
-
<c:processingDirectives
|
57
|
-
c:contentType="text/txt"
|
58
|
-
c:outputFormat="#{format}">
|
59
|
-
</c:processingDirectives>
|
60
|
-
<c:userDirectives />
|
61
|
-
<c:externalMetadata />
|
62
|
-
</c:params>
|
63
|
-
paramsXML
|
64
70
|
end
|
71
|
+
|
65
72
|
end
|
data/lib/jkl/rest_client.rb
CHANGED
@@ -3,34 +3,38 @@ require 'hpricot'
|
|
3
3
|
|
4
4
|
module Jkl
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
6
|
+
class << self
|
7
|
+
|
8
|
+
def post_to(uri, post_args)
|
9
|
+
begin
|
10
|
+
resp, data = Net::HTTP.post_form(uri, post_args)
|
11
|
+
data
|
12
|
+
rescue URI::InvalidURIError => e
|
13
|
+
puts("WARN: Invalid URI: #{e}")
|
14
|
+
rescue SocketError => e
|
15
|
+
puts("WARN: Could not connect: #{e}")
|
16
|
+
rescue Errno::ECONNREFUSED => e
|
17
|
+
puts("WARN: Connection refused: #{e}")
|
18
|
+
end
|
16
19
|
end
|
17
|
-
end
|
18
20
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
21
|
+
def get_from(uri)
|
22
|
+
begin
|
23
|
+
res = Net::HTTP.get_response(URI.parse(uri))
|
24
|
+
res.body
|
25
|
+
rescue URI::InvalidURIError => e
|
26
|
+
puts("WARN: Invalid URI: #{e}")
|
27
|
+
rescue SocketError => e
|
28
|
+
puts("WARN: Could not connect: #{e}")
|
29
|
+
rescue Errno::ECONNREFUSED => e
|
30
|
+
puts("WARN: Connection refused: #{e}")
|
31
|
+
end
|
29
32
|
end
|
30
|
-
end
|
31
33
|
|
32
|
-
|
33
|
-
|
34
|
-
|
34
|
+
def get_from_as_xml(uri)
|
35
|
+
Hpricot.XML get_from uri
|
36
|
+
end
|
35
37
|
|
38
|
+
end
|
39
|
+
|
36
40
|
end
|
data/lib/jkl/rss_client.rb
CHANGED
@@ -2,14 +2,18 @@ require 'hpricot'
|
|
2
2
|
|
3
3
|
module Jkl
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
(rssdoc
|
8
|
-
|
9
|
-
|
5
|
+
class << self
|
6
|
+
|
7
|
+
def get_items_from(rssdoc)
|
8
|
+
items = []
|
9
|
+
(rssdoc/:item).each { |rssitem| items.push rssitem } unless rssdoc==nil
|
10
|
+
items
|
11
|
+
end
|
12
|
+
|
13
|
+
def attribute_from(item, name)
|
14
|
+
(item/name).inner_html
|
15
|
+
end
|
10
16
|
|
11
|
-
def self.attribute_from(item, name)
|
12
|
-
(item/name).inner_html
|
13
17
|
end
|
14
18
|
|
15
19
|
end
|
data/lib/jkl/url_doc_handler.rb
CHANGED
@@ -3,29 +3,33 @@ require 'rest_client'
|
|
3
3
|
|
4
4
|
module Jkl
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
6
|
+
class << self
|
7
|
+
|
8
|
+
def sanitize(text)
|
9
|
+
str = ""
|
10
|
+
text = text.to_s.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i,"") #remove script tags - with contents
|
11
|
+
text.to_s.gsub(/<\/?[^>]*>/, "").split("\r").each do |l| # remove all tags
|
12
|
+
l = l.gsub(/^[ \t]/,"") #remove tabs
|
13
|
+
l = l.gsub(/^[ \s]/,"")
|
14
|
+
l.split("\n").each do |l|
|
15
|
+
str << l unless l.count(" ") < 5 # remove short lines - ususally just navigation
|
16
|
+
end
|
14
17
|
end
|
18
|
+
str
|
15
19
|
end
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
puts("WARN: Connection refused: #{e}")
|
20
|
+
|
21
|
+
def from_doc(response)
|
22
|
+
begin
|
23
|
+
Hpricot(response)
|
24
|
+
rescue URI::InvalidURIError => e
|
25
|
+
puts("WARN: Problem with getting a connection: #{e}")
|
26
|
+
rescue SocketError => e
|
27
|
+
puts("WARN: Could not connect to feed: #{e}")
|
28
|
+
rescue Errno::ECONNREFUSED => e
|
29
|
+
puts("WARN: Connection refused: #{e}")
|
30
|
+
end
|
28
31
|
end
|
32
|
+
|
29
33
|
end
|
30
34
|
|
31
35
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jakal
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- sshingler
|
@@ -13,7 +13,7 @@ date: 2009-08-27 00:00:00 +01:00
|
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
16
|
-
description: Jakal is a Ruby library for
|
16
|
+
description: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
|
17
17
|
email: "'shingler@gmail.com'"
|
18
18
|
executables: []
|
19
19
|
|
@@ -30,7 +30,6 @@ files:
|
|
30
30
|
- lib/jkl/url_doc_handler.rb
|
31
31
|
- features/calais.feature
|
32
32
|
- features/http.feature
|
33
|
-
- features/processing.feature
|
34
33
|
- features/sanitize-text.feature
|
35
34
|
- features/mocks/bbc_story.html
|
36
35
|
- features/mocks/calais.json
|
@@ -38,7 +37,6 @@ files:
|
|
38
37
|
- features/mocks/twitter.json
|
39
38
|
- features/step_definitions/calais_steps.rb
|
40
39
|
- features/step_definitions/http_steps.rb
|
41
|
-
- features/step_definitions/processing_steps.rb
|
42
40
|
- features/step_definitions/require_steps.rb
|
43
41
|
- features/step_definitions/sanitize-text_steps.rb
|
44
42
|
- features/step_definitions/twitter_steps.rb
|
@@ -73,6 +71,6 @@ rubyforge_project:
|
|
73
71
|
rubygems_version: 1.3.5
|
74
72
|
signing_key:
|
75
73
|
specification_version: 3
|
76
|
-
summary: Jakal is a Ruby library for
|
74
|
+
summary: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
|
77
75
|
test_files: []
|
78
76
|
|
data/features/processing.feature
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
Feature: Processing features
|
2
|
-
In order to integrate our apps
|
3
|
-
As a developer
|
4
|
-
I want to make some requests and inspect some responses
|
5
|
-
|
6
|
-
@connection_needed
|
7
|
-
Scenario: end to end flow, keyphrase to tags
|
8
|
-
Given I have a keyphrase 'london restaurants'
|
9
|
-
When I request some RSS
|
10
|
-
Then I should receive some headlines
|
11
|
-
And I should be able to get the copy from the first headline
|
12
|
-
When I post to calais
|
13
|
-
Then I should receive some tags
|
14
|
-
And I should be able to persist these tags
|
15
|
-
When I generate a view of the recent keyword results
|
16
|
-
Then I should see a network graph
|
@@ -1,30 +0,0 @@
|
|
1
|
-
|
2
|
-
############### pending steps below ################
|
3
|
-
|
4
|
-
|
5
|
-
When /^I request tags for the first story$/ do
|
6
|
-
r = Jkl::get_from_calais @story
|
7
|
-
Jkl::get_tag_from_json(get_from_calais(@story)) do |tag|
|
8
|
-
tag.each{|k,v| puts "#{k} : #{v}"}
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
When /^I request stories from Topix$/ do
|
13
|
-
search_term = 'london'
|
14
|
-
url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
|
15
|
-
@response = Jkl::get_from_as_xml url
|
16
|
-
end
|
17
|
-
|
18
|
-
When /^I get some news stories from the first keyword$/ do
|
19
|
-
search_term = @trend['name'].gsub('#','') #removing hash from start of trend name
|
20
|
-
search_term = 'london'
|
21
|
-
url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
|
22
|
-
rss_response = Jkl::get_from_as_xml url
|
23
|
-
items = Jkl::get_items_from rss_response
|
24
|
-
links = []
|
25
|
-
items.each do |item|
|
26
|
-
links << attribute_from(item, :link)
|
27
|
-
end
|
28
|
-
@story = Jkl::sanitize Jkl::from_doc Jkl::get_from links[0]
|
29
|
-
end
|
30
|
-
|