jakal 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/features/calais.feature +1 -1
- data/features/step_definitions/calais_steps.rb +2 -2
- data/lib/jkl.rb +40 -32
- data/lib/jkl/calais_client.rb +57 -50
- data/lib/jkl/rest_client.rb +29 -25
- data/lib/jkl/rss_client.rb +11 -7
- data/lib/jkl/url_doc_handler.rb +24 -20
- metadata +3 -5
- data/features/processing.feature +0 -16
- data/features/step_definitions/processing_steps.rb +0 -30
data/features/calais.feature
CHANGED
@@ -21,7 +21,7 @@ Feature: Calais-Specific features
|
|
21
21
|
Scenario: Get nested tags from calais
|
22
22
|
Given I have some simple text
|
23
23
|
When I request the nested entities from calais
|
24
|
-
Then I should
|
24
|
+
Then I should receive the entities grouped into categories
|
25
25
|
|
26
26
|
Scenario: Clean up blank items from a calais response
|
27
27
|
Given I have a mock calais response
|
@@ -44,9 +44,9 @@ end
|
|
44
44
|
|
45
45
|
When /^I request the nested entities from calais$/ do
|
46
46
|
key = YAML::load_file('config/keys.yml')['calais']
|
47
|
-
@response = Jkl::
|
47
|
+
@response = Jkl::tags key, @text
|
48
48
|
end
|
49
49
|
|
50
|
-
Then /^I should
|
50
|
+
Then /^I should receive the entities grouped into categories$/ do
|
51
51
|
@response.eql?({"Person"=>["Barack Obama", "Hillary Clinton"], "Position"=>["Secretary of State"]}).should == true
|
52
52
|
end
|
data/lib/jkl.rb
CHANGED
@@ -6,43 +6,51 @@ require "jkl/url_doc_handler.rb"
|
|
6
6
|
|
7
7
|
module Jkl
|
8
8
|
|
9
|
-
|
10
|
-
get_from_as_xml "#{feed}#{keyphrase}"
|
11
|
-
end
|
9
|
+
class << self
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
descriptions = ""
|
16
|
-
items.each do |item|
|
17
|
-
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
11
|
+
def headlines(feed, keyphrase)
|
12
|
+
get_from_as_xml "#{feed}#{keyphrase}"
|
18
13
|
end
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
descriptions
|
14
|
+
|
15
|
+
def pages(headlines)
|
16
|
+
items = get_items_from headlines
|
17
|
+
descriptions = ""
|
18
|
+
items.each do |item|
|
19
|
+
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
20
|
+
end
|
21
|
+
descriptions
|
27
22
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
23
|
+
|
24
|
+
def descriptions(headlines)
|
25
|
+
items = get_items_from headlines
|
26
|
+
descriptions = []
|
27
|
+
items.each do |item|
|
28
|
+
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
29
|
+
end
|
30
|
+
descriptions
|
31
|
+
end
|
32
|
+
|
33
|
+
def links(headlines)
|
34
|
+
items = get_items_from headlines
|
35
|
+
links = []
|
36
|
+
items.each do |item|
|
37
|
+
links << attribute_from(item, :link)
|
38
|
+
end
|
39
|
+
links
|
36
40
|
end
|
37
|
-
links
|
38
|
-
end
|
39
41
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
42
|
+
def tags(key, text)
|
43
|
+
nested_list = {}
|
44
|
+
entities(key,text).each do |a|
|
45
|
+
nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
|
46
|
+
end
|
47
|
+
nested_list
|
48
|
+
end
|
49
|
+
|
50
|
+
def entities(key,text)
|
51
|
+
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
52
|
+
end
|
53
|
+
|
46
54
|
end
|
47
55
|
|
48
56
|
end
|
data/lib/jkl/calais_client.rb
CHANGED
@@ -4,62 +4,69 @@ require "calais"
|
|
4
4
|
|
5
5
|
module Jkl
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
class << self
|
8
|
+
|
9
|
+
#using the calais gem
|
10
|
+
def calais_response(key, pages)
|
11
|
+
Calais.process_document(
|
12
|
+
:content => pages,
|
13
|
+
:content_type => :text,
|
14
|
+
:license_id => key
|
15
|
+
)
|
16
16
|
end
|
17
|
-
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
18
|
+
def get_from_calais(content)
|
19
|
+
begin
|
20
|
+
license_id = YAML::load_file('config/keys.yml')['calais']
|
21
|
+
c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
|
22
|
+
post_args = { 'licenseID' => license_id, 'content' => content,
|
23
|
+
'paramsXML' => paramsXML('application/json') }
|
24
|
+
post_to(c_uri, post_args)
|
25
|
+
rescue Exception => e
|
26
|
+
puts e
|
27
|
+
end
|
27
28
|
end
|
29
|
+
|
30
|
+
def get_tag_from_json(response)
|
31
|
+
result = JSON.parse response
|
32
|
+
result.delete_if {|key, value| key == "doc" } # ditching the doc
|
33
|
+
cleaned_result = []
|
34
|
+
result.each do |key,tag|
|
35
|
+
tag = Jkl::clean_unwanted_items_from_hash tag
|
36
|
+
cleaned_result << tag
|
37
|
+
yield tag if block_given?
|
38
|
+
end
|
28
39
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
def self.get_calais_metadata(key, text)
|
33
|
-
name_value_pairs = Jkl::tags(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
34
|
-
nested_list = {}
|
35
|
-
name_value_pairs.each { |a| nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }}
|
36
|
-
nested_list
|
37
|
-
end
|
40
|
+
cleaned_result
|
41
|
+
end
|
38
42
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
#jkl doesn't work with these aspects of the calais response, also removing blanks
|
44
|
+
def clean_unwanted_items_from_hash h
|
45
|
+
h.delete_if {|k, v| k == "relevance" }
|
46
|
+
h.delete_if {|k, v| k == "instances" }
|
47
|
+
h.delete_if {|k, v| v == "N/A"}
|
48
|
+
h.delete_if {|k, v| v == []}
|
49
|
+
h.delete_if {|k, v| v == ""}
|
50
|
+
h.delete_if {|k, v| k == "_typeGroup"}
|
51
|
+
h
|
52
|
+
end
|
49
53
|
|
50
|
-
|
54
|
+
private
|
55
|
+
|
56
|
+
def paramsXML(format)
|
57
|
+
<<-paramsXML;
|
58
|
+
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
|
59
|
+
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
60
|
+
<c:processingDirectives
|
61
|
+
c:contentType="text/txt"
|
62
|
+
c:outputFormat="#{format}">
|
63
|
+
</c:processingDirectives>
|
64
|
+
<c:userDirectives />
|
65
|
+
<c:externalMetadata />
|
66
|
+
</c:params>
|
67
|
+
paramsXML
|
68
|
+
end
|
51
69
|
|
52
|
-
def self.paramsXML(format)
|
53
|
-
<<-paramsXML;
|
54
|
-
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
|
55
|
-
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
56
|
-
<c:processingDirectives
|
57
|
-
c:contentType="text/txt"
|
58
|
-
c:outputFormat="#{format}">
|
59
|
-
</c:processingDirectives>
|
60
|
-
<c:userDirectives />
|
61
|
-
<c:externalMetadata />
|
62
|
-
</c:params>
|
63
|
-
paramsXML
|
64
70
|
end
|
71
|
+
|
65
72
|
end
|
data/lib/jkl/rest_client.rb
CHANGED
@@ -3,34 +3,38 @@ require 'hpricot'
|
|
3
3
|
|
4
4
|
module Jkl
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
6
|
+
class << self
|
7
|
+
|
8
|
+
def post_to(uri, post_args)
|
9
|
+
begin
|
10
|
+
resp, data = Net::HTTP.post_form(uri, post_args)
|
11
|
+
data
|
12
|
+
rescue URI::InvalidURIError => e
|
13
|
+
puts("WARN: Invalid URI: #{e}")
|
14
|
+
rescue SocketError => e
|
15
|
+
puts("WARN: Could not connect: #{e}")
|
16
|
+
rescue Errno::ECONNREFUSED => e
|
17
|
+
puts("WARN: Connection refused: #{e}")
|
18
|
+
end
|
16
19
|
end
|
17
|
-
end
|
18
20
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
21
|
+
def get_from(uri)
|
22
|
+
begin
|
23
|
+
res = Net::HTTP.get_response(URI.parse(uri))
|
24
|
+
res.body
|
25
|
+
rescue URI::InvalidURIError => e
|
26
|
+
puts("WARN: Invalid URI: #{e}")
|
27
|
+
rescue SocketError => e
|
28
|
+
puts("WARN: Could not connect: #{e}")
|
29
|
+
rescue Errno::ECONNREFUSED => e
|
30
|
+
puts("WARN: Connection refused: #{e}")
|
31
|
+
end
|
29
32
|
end
|
30
|
-
end
|
31
33
|
|
32
|
-
|
33
|
-
|
34
|
-
|
34
|
+
def get_from_as_xml(uri)
|
35
|
+
Hpricot.XML get_from uri
|
36
|
+
end
|
35
37
|
|
38
|
+
end
|
39
|
+
|
36
40
|
end
|
data/lib/jkl/rss_client.rb
CHANGED
@@ -2,14 +2,18 @@ require 'hpricot'
|
|
2
2
|
|
3
3
|
module Jkl
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
(rssdoc
|
8
|
-
|
9
|
-
|
5
|
+
class << self
|
6
|
+
|
7
|
+
def get_items_from(rssdoc)
|
8
|
+
items = []
|
9
|
+
(rssdoc/:item).each { |rssitem| items.push rssitem } unless rssdoc==nil
|
10
|
+
items
|
11
|
+
end
|
12
|
+
|
13
|
+
def attribute_from(item, name)
|
14
|
+
(item/name).inner_html
|
15
|
+
end
|
10
16
|
|
11
|
-
def self.attribute_from(item, name)
|
12
|
-
(item/name).inner_html
|
13
17
|
end
|
14
18
|
|
15
19
|
end
|
data/lib/jkl/url_doc_handler.rb
CHANGED
@@ -3,29 +3,33 @@ require 'rest_client'
|
|
3
3
|
|
4
4
|
module Jkl
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
6
|
+
class << self
|
7
|
+
|
8
|
+
def sanitize(text)
|
9
|
+
str = ""
|
10
|
+
text = text.to_s.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i,"") #remove script tags - with contents
|
11
|
+
text.to_s.gsub(/<\/?[^>]*>/, "").split("\r").each do |l| # remove all tags
|
12
|
+
l = l.gsub(/^[ \t]/,"") #remove tabs
|
13
|
+
l = l.gsub(/^[ \s]/,"")
|
14
|
+
l.split("\n").each do |l|
|
15
|
+
str << l unless l.count(" ") < 5 # remove short lines - ususally just navigation
|
16
|
+
end
|
14
17
|
end
|
18
|
+
str
|
15
19
|
end
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
puts("WARN: Connection refused: #{e}")
|
20
|
+
|
21
|
+
def from_doc(response)
|
22
|
+
begin
|
23
|
+
Hpricot(response)
|
24
|
+
rescue URI::InvalidURIError => e
|
25
|
+
puts("WARN: Problem with getting a connection: #{e}")
|
26
|
+
rescue SocketError => e
|
27
|
+
puts("WARN: Could not connect to feed: #{e}")
|
28
|
+
rescue Errno::ECONNREFUSED => e
|
29
|
+
puts("WARN: Connection refused: #{e}")
|
30
|
+
end
|
28
31
|
end
|
32
|
+
|
29
33
|
end
|
30
34
|
|
31
35
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jakal
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- sshingler
|
@@ -13,7 +13,7 @@ date: 2009-08-27 00:00:00 +01:00
|
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
16
|
-
description: Jakal is a Ruby library for
|
16
|
+
description: Jakal is a Ruby library which contains some utilies for connecting to internet based APIs.
|
17
17
|
email: "'shingler@gmail.com'"
|
18
18
|
executables: []
|
19
19
|
|
@@ -30,7 +30,6 @@ files:
|
|
30
30
|
- lib/jkl/url_doc_handler.rb
|
31
31
|
- features/calais.feature
|
32
32
|
- features/http.feature
|
33
|
-
- features/processing.feature
|
34
33
|
- features/sanitize-text.feature
|
35
34
|
- features/mocks/bbc_story.html
|
36
35
|
- features/mocks/calais.json
|
@@ -38,7 +37,6 @@ files:
|
|
38
37
|
- features/mocks/twitter.json
|
39
38
|
- features/step_definitions/calais_steps.rb
|
40
39
|
- features/step_definitions/http_steps.rb
|
41
|
-
- features/step_definitions/processing_steps.rb
|
42
40
|
- features/step_definitions/require_steps.rb
|
43
41
|
- features/step_definitions/sanitize-text_steps.rb
|
44
42
|
- features/step_definitions/twitter_steps.rb
|
@@ -73,6 +71,6 @@ rubyforge_project:
|
|
73
71
|
rubygems_version: 1.3.5
|
74
72
|
signing_key:
|
75
73
|
specification_version: 3
|
76
|
-
summary: Jakal is a Ruby library for
|
74
|
+
summary: Jakal is a Ruby library which contains some utilies for connecting to internet based APIs.
|
77
75
|
test_files: []
|
78
76
|
|
data/features/processing.feature
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
Feature: Processing features
|
2
|
-
In order to integrate our apps
|
3
|
-
As a developer
|
4
|
-
I want to make some requests and inspect some responses
|
5
|
-
|
6
|
-
@connection_needed
|
7
|
-
Scenario: end to end flow, keyphrase to tags
|
8
|
-
Given I have a keyphrase 'london restaurants'
|
9
|
-
When I request some RSS
|
10
|
-
Then I should receive some headlines
|
11
|
-
And I should be able to get the copy from the first headline
|
12
|
-
When I post to calais
|
13
|
-
Then I should receive some tags
|
14
|
-
And I should be able to persist these tags
|
15
|
-
When I generate a view of the recent keyword results
|
16
|
-
Then I should see a network graph
|
@@ -1,30 +0,0 @@
|
|
1
|
-
|
2
|
-
############### pending steps below ################
|
3
|
-
|
4
|
-
|
5
|
-
When /^I request tags for the first story$/ do
|
6
|
-
r = Jkl::get_from_calais @story
|
7
|
-
Jkl::get_tag_from_json(get_from_calais(@story)) do |tag|
|
8
|
-
tag.each{|k,v| puts "#{k} : #{v}"}
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
When /^I request stories from Topix$/ do
|
13
|
-
search_term = 'london'
|
14
|
-
url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
|
15
|
-
@response = Jkl::get_from_as_xml url
|
16
|
-
end
|
17
|
-
|
18
|
-
When /^I get some news stories from the first keyword$/ do
|
19
|
-
search_term = @trend['name'].gsub('#','') #removing hash from start of trend name
|
20
|
-
search_term = 'london'
|
21
|
-
url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
|
22
|
-
rss_response = Jkl::get_from_as_xml url
|
23
|
-
items = Jkl::get_items_from rss_response
|
24
|
-
links = []
|
25
|
-
items.each do |item|
|
26
|
-
links << attribute_from(item, :link)
|
27
|
-
end
|
28
|
-
@story = Jkl::sanitize Jkl::from_doc Jkl::get_from links[0]
|
29
|
-
end
|
30
|
-
|