jakal 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/License.txt +22 -0
- data/README.rdoc +28 -0
- data/features/calais.feature +38 -0
- data/features/http.feature +32 -0
- data/features/mocks/bbc_story.html +2863 -0
- data/features/mocks/calais.json +2464 -0
- data/features/mocks/topix_rss.xml +47 -0
- data/features/mocks/twitter.json +11 -0
- data/features/processing.feature +16 -0
- data/features/sanitize-text.feature +53 -0
- data/features/step_definitions/calais_steps.rb +44 -0
- data/features/step_definitions/http_steps.rb +56 -0
- data/features/step_definitions/processing_steps.rb +30 -0
- data/features/step_definitions/require_steps.rb +12 -0
- data/features/step_definitions/sanitize-text_steps.rb +32 -0
- data/features/step_definitions/twitter_steps.rb +17 -0
- data/features/support/env.rb +10 -0
- data/lib/jkl.rb +48 -0
- data/lib/jkl/calais_client.rb +64 -0
- data/lib/jkl/rest_client.rb +36 -0
- data/lib/jkl/rss_client.rb +15 -0
- data/lib/jkl/url_doc_handler.rb +31 -0
- metadata +78 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
2
|
+
<?xml-stylesheet href="/static/rss.3.xsl" type="text/xsl"?>
|
3
|
+
<rss xmlns:topix="http://www.topix.com/partners/rsscomment/" xmlns:georss="http://www.georss.org/georss" version="2.0">
|
4
|
+
<channel>
|
5
|
+
<title>Search for "london" </title>
|
6
|
+
<link>http://www.topix.com/search/article?q=london&x=0&y=0</link>
|
7
|
+
<topix:rsslink>http://www.topix.com/rss/search/article.xml?q=london&x=0&y=0</topix:rsslink>
|
8
|
+
<description>News continually updated from thousands of sources across the web</description>
|
9
|
+
<language>en-us</language>
|
10
|
+
<ttl>240</ttl>
|
11
|
+
<copyright>Copyright 2008, Topix</copyright>
|
12
|
+
<image>
|
13
|
+
<title>Topix</title>
|
14
|
+
<link>http://www.topix.com/</link>
|
15
|
+
<url>http://topix.cachefly.net/pics/topix_homepage_logo2.png</url>
|
16
|
+
</image>
|
17
|
+
<item>
|
18
|
+
<title>Major Michael Jackson tribute planned for Vienna</title>
|
19
|
+
<link>http://www.localnews8.com/Global/story.asp?S=10876507</link>
|
20
|
+
<description><![CDATA[The King of Pop will get a royal send-off next month in Vienna. Events promoter World Awards Media GmbH confirmed Monday that members of Michael Jackson's family and a "high-profile lineup of international stars" are planning a tribute concert in the Austrian capital.]]></description>
|
21
|
+
<source>KIFI</source>
|
22
|
+
<pubDate>Mon, 10 Aug 2009 15:21:58 GMT</pubDate>
|
23
|
+
<category>Jermaine Jackson</category>
|
24
|
+
<category>Michael Jackson</category>
|
25
|
+
<category>Pop/Rock</category>
|
26
|
+
<category>Black Entertainment</category>
|
27
|
+
<category>R-N-B</category>
|
28
|
+
<guid isPermaLink="false">C39RO2C8Q8NQR825</guid>
|
29
|
+
</item>
|
30
|
+
<item>
|
31
|
+
<title>LATEST: Man stabbed to death with machete in pub garden</title>
|
32
|
+
<link>http://www.thisislondon.co.uk/standard/article-23730262-details/LATEST%3A+Man+stabbed+to+death+with+machete+in+pub+garden/article.do</link>
|
33
|
+
<description><![CDATA[A man drinking with his girlfriend in a beer garden was hacked to death with machetes in front of horrified drinkers.]]></description>
|
34
|
+
<source>This is London</source>
|
35
|
+
<pubDate>Mon, 10 Aug 2009 15:21:51 GMT</pubDate>
|
36
|
+
<category>Greater London County, England</category>
|
37
|
+
<category>Cheshire County, England</category>
|
38
|
+
<category>England, United Kingdom</category>
|
39
|
+
<category>United Kingdom</category>
|
40
|
+
<category>Sutton, England</category>
|
41
|
+
<category>London, England</category>
|
42
|
+
<category>World News</category>
|
43
|
+
<category>Essex County, England</category>
|
44
|
+
<guid isPermaLink="false">KN03NCSH3KIVG416</guid>
|
45
|
+
</item>
|
46
|
+
</channel>
|
47
|
+
</rss>
|
@@ -0,0 +1,11 @@
|
|
1
|
+
{
|
2
|
+
"trends": [{
|
3
|
+
"name": "musicmonday",
|
4
|
+
"url": "http:\/\/search.twitter.com\/search?q=%23musicmonday"
|
5
|
+
},
|
6
|
+
{
|
7
|
+
"name": "GI Joe",
|
8
|
+
"url": "http:\/\/search.twitter.com\/search?q=%22GI+Joe%22+OR+Joe"
|
9
|
+
}],
|
10
|
+
"as_of": "Mon, 10 Aug 2009 15:04:54 +0000"
|
11
|
+
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
Feature: Processing features
|
2
|
+
In order to integrate our apps
|
3
|
+
As a developer
|
4
|
+
I want to make some requests and inspect some responses
|
5
|
+
|
6
|
+
@connection_needed
|
7
|
+
Scenario: end to end flow, keyphrase to tags
|
8
|
+
Given I have a keyphrase 'london restaurants'
|
9
|
+
When I request some RSS
|
10
|
+
Then I should receive some headlines
|
11
|
+
And I should be able to get the copy from the first headline
|
12
|
+
When I post to calais
|
13
|
+
Then I should receive some tags
|
14
|
+
And I should be able to persist these tags
|
15
|
+
When I generate a view of the recent keyword results
|
16
|
+
Then I should see a network graph
|
@@ -0,0 +1,53 @@
|
|
1
|
+
Feature: Sanitizing text
|
2
|
+
In order to integrate our apps
|
3
|
+
As a developer
|
4
|
+
I want to make some requests and inspect some responses
|
5
|
+
|
6
|
+
@unit @text
|
7
|
+
Scenario: Sanitize some ok text
|
8
|
+
Given I have a keyphrase 'the cat sat on the mat'
|
9
|
+
When I sanitize this text
|
10
|
+
Then it should be ok
|
11
|
+
And it should say 'the cat sat on the mat'
|
12
|
+
|
13
|
+
@unit @text
|
14
|
+
Scenario: Sanitize some short text
|
15
|
+
Given I have a keyphrase 'the cat sat'
|
16
|
+
When I sanitize this text
|
17
|
+
Then it should say ''
|
18
|
+
|
19
|
+
@unit @text @wip
|
20
|
+
Scenario: Sanitize some text with tabs and spaces
|
21
|
+
Given I have a keyphrase 'the cat sat on the mat '
|
22
|
+
When I sanitize this text
|
23
|
+
Then it should say 'the cat sat on the mat'
|
24
|
+
|
25
|
+
@unit @text @wip
|
26
|
+
Scenario: Sanitize some short text with tabs and spaces
|
27
|
+
Given I have a keyphrase 'the cat sat on '
|
28
|
+
When I sanitize this text
|
29
|
+
Then it should say ''
|
30
|
+
|
31
|
+
@unit @text
|
32
|
+
Scenario: Sanitize some tagged short text
|
33
|
+
Given I have a keyphrase '<a href="a-link.html>the cat sat</a>'
|
34
|
+
When I sanitize this text
|
35
|
+
Then it should say ''
|
36
|
+
|
37
|
+
@unit @text
|
38
|
+
Scenario: Sanitize some tagged text
|
39
|
+
Given I have a keyphrase '<a href="a-link.html>the cat sat on the mat</a>'
|
40
|
+
When I sanitize this text
|
41
|
+
Then it should be ok
|
42
|
+
Then it should say 'the cat sat on the mat'
|
43
|
+
|
44
|
+
@unit @text @wip
|
45
|
+
Scenario: Remove script tags
|
46
|
+
Given I have some script tag data
|
47
|
+
When I sanitize this text
|
48
|
+
Then it should say ' some para stuff here '
|
49
|
+
|
50
|
+
Scenario: Clean a web page
|
51
|
+
Given I have a sample BBC story
|
52
|
+
When I sanitize this text
|
53
|
+
Then it should be ok
|
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
# Seeds @text with a single news sentence for later steps to work on.
Given(/^I have some simple text$/) do
  @text = "Barack Obama said today that he expects there to be conflict within his new security team after confirming Hillary Clinton as his choice for US Secretary of State."
end
|
5
|
+
|
6
|
+
# Composite step: runs the "I have a sample BBC story" step and then the
# "I sanitize this text" step by invoking them by name.
Given /^I have a sanitized sample BBC story$/ do
  Given "I have a sample BBC story"
  When "I sanitize this text"
end
|
10
|
+
|
11
|
+
# Loads the canned Calais JSON fixture into @response as raw text.
# File.read is used because `readlines.to_s` returns the Array#inspect dump of
# the lines on Ruby 1.9+, not the file contents.
Given(/^I have a mock calais response$/) do
  @response = File.read('features/mocks/calais.json')
end
|
14
|
+
|
15
|
+
# Sends the current @text to Calais and keeps the reply in @response.
When(/^I post to calais$/) do
  @response = Jkl.get_from_calais(@text)
end
|
18
|
+
|
19
|
+
# Parses @response as JSON and strips the entries jkl does not use,
# storing the cleaned hash in @processed_json.
When(/^I remove the unwanted items$/) do
  parsed = JSON.parse(@response)
  @processed_json = Jkl.clean_unwanted_items_from_hash(parsed)
end
|
22
|
+
|
23
|
+
# Asserts that the named key is absent from the cleaned hash.
Then(/^there should no longer be any "([^\"]*)"$/) do |key|
  @processed_json[key].should be_nil
end
|
26
|
+
|
27
|
+
# Asserts that the Calais response yields at least one tag and that no tag is nil.
# The emptiness check matters: asserting only inside the yield block would pass
# without checking anything when no tags come back.
Then(/^I should receive some tags$/) do
  tags = Jkl.get_tag_from_json(@response)
  tags.should_not be_empty
  tags.each { |tag| tag.should_not be_nil }
end
|
32
|
+
|
33
|
+
# Prints the "_type" entry of every tag in the response.
# NOTE(review): the captured tag type is never used and nothing is asserted,
# so this step only produces output — confirm whether a check was intended.
Then(/^there should be some "([^\"]*)" tags$/) do |_tag_type|
  Jkl.get_tag_from_json(@response) do |tag|
    tag.each do |key, value|
      puts "#{key} : #{value}" if key == '_type'
    end
  end
end
|
39
|
+
|
40
|
+
# Uses the non-block form of get_tag_from_json and checks it returned something.
Then(/^I should be able to see the whole lot of tags as one block$/) do
  tag_count = Jkl.get_tag_from_json(@response).length
  tag_count.should > 0
end
|
44
|
+
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Posts a sample sentence to the Yahoo term-extraction endpoint and stores the
# reply in @response. The application id is read from config/keys.yml.
When(/^I post some data to yahoo$/) do
  @url = URI.parse('http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction')
  # Plain local instead of a constant: assigning a constant inside a step body
  # re-assigns it every time the step runs.
  appid = YAML.load_file('config/keys.yml')['yahoo']
  # NOTE(review): URI.encode is not available on Ruby 3.0+ — confirm the target
  # Ruby version before relying on it.
  context = URI.encode('Italian sculptors and painters of the renaissance favored the Virgin Mary for inspiration')
  post_args = { 'appid' => appid, 'context' => context, 'output' => 'json' }
  @response = Jkl.post_to(@url, post_args)
end
|
8
|
+
|
9
|
+
# Fetches the Topix feed for @keyphrase (defaulting to "iraq") and stores the
# parsed XML in @response. The feed base URL comes from config/config.yml.
When(/^I request some RSS$/) do
  phrase = @keyphrase || "iraq"
  feed_base = YAML.load_file('config/config.yml')['topix']
  @response = Jkl.get_from_as_xml("#{feed_base}#{CGI.escape(phrase)}")
end
|
14
|
+
|
15
|
+
# Parses the canned Topix RSS fixture with Hpricot and stores it in @response.
# File.read is used because `readlines.to_s` returns the Array#inspect dump of
# the lines on Ruby 1.9+, not the XML text.
Given(/^I have some RSS$/) do
  raw = File.read('features/mocks/topix_rss.xml')
  @response = Hpricot.XML(raw)
end
|
19
|
+
|
20
|
+
# Performs a GET against a fixed BBC News article and keeps the body in @response.
When(/^I make a restful get request$/) do
  @response = Jkl.get_from("http://news.bbc.co.uk/1/hi/uk_politics/7677419.stm")
end
|
24
|
+
|
25
|
+
# Fetches the Twitter trends JSON (URL from config/config.yml) and stores the
# value under its "trends" key in @response.
When(/^I request some trends$/) do
  trends_url = YAML.load_file('config/config.yml')['twitter']
  payload = Jkl.get_from(trends_url)
  @response = JSON.parse(payload)['trends']
end
|
30
|
+
|
31
|
+
|
32
|
+
# Asserts that a previous step stored something in @response.
Then(/^I should get a response$/) do
  @response.should_not be_nil
end
|
36
|
+
|
37
|
+
# Extracts the feed items from @response into @items, collects each item's
# link into @links, and asserts that at least one link was found.
Then(/^I should receive some headlines$/) do
  @items = Jkl.get_items_from(@response)
  @links = @items.map { |item| Jkl.attribute_from(item, :link) }
  # @links is always an array here, so only emptiness is worth asserting.
  @links.should_not be_empty
end
|
46
|
+
|
47
|
+
# Downloads the page behind the first collected link, checks the body is
# present and non-empty, then stores the sanitized copy in @text.
Then(/^I should be able to get the copy from the first headline$/) do
  @response = Jkl.get_from(@links.first)
  @response.should_not be_nil
  @response.should_not == ""
  @text = Jkl.sanitize(@response)
end
|
53
|
+
|
54
|
+
# Asserts that @response has a non-zero length.
Then(/^I should see some text$/) do
  response_length = @response.length
  response_length.should > 0
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
|
2
|
+
############### pending steps below ################
|
3
|
+
|
4
|
+
|
5
|
+
# Sends @story to Calais once and prints every key/value pair of each tag.
# The call is qualified with Jkl because get_from_calais is defined as a
# module-level method, and the single response is reused instead of posting twice.
When(/^I request tags for the first story$/) do
  response = Jkl.get_from_calais(@story)
  Jkl.get_tag_from_json(response) do |tag|
    tag.each { |key, value| puts "#{key} : #{value}" }
  end
end
|
11
|
+
|
12
|
+
# Fetches the Topix feed for the fixed term "london" and stores the parsed XML
# in @response. The feed base URL comes from config/config.yml.
When(/^I request stories from Topix$/) do
  topix_base = YAML.load_file('config/config.yml')['topix']
  @response = Jkl.get_from_as_xml("#{topix_base}london")
end
|
17
|
+
|
18
|
+
# Fetches the Topix feed for a search term, follows the first item's link and
# stores the sanitized page text in @story.
When(/^I get some news stories from the first keyword$/) do
  # The term derived from @trend['name'] was always overwritten by this fixed
  # value, so the dead lookup has been removed.
  # TODO: use the trend name (minus its leading '#') once trends are wired in.
  search_term = 'london'
  url = "#{YAML.load_file('config/config.yml')['topix']}#{search_term}"
  items = Jkl.get_items_from(Jkl.get_from_as_xml(url))
  # Qualified with Jkl to match how the other step files call attribute_from.
  links = items.map { |item| Jkl.attribute_from(item, :link) }
  @story = Jkl.sanitize(Jkl.from_doc(Jkl.get_from(links.first)))
end
|
30
|
+
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
require 'json'
|
3
|
+
require 'restclient'
|
4
|
+
require 'haml'
|
5
|
+
require 'cgi'
|
6
|
+
require 'lib/jkl.rb'
|
7
|
+
require 'lib/jkl/calais_client.rb'
|
8
|
+
require 'lib/jkl/rest_client.rb'
|
9
|
+
require 'lib/jkl/rss_client.rb'
|
10
|
+
require 'lib/jkl/url_doc_handler.rb'
|
11
|
+
|
12
|
+
include Jkl
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Stores the quoted phrase for later steps.
# @text feeds the sanitize steps; @keyphrase is what the "I request some RSS"
# step reads, so it is set here too — otherwise that step falls back to its default.
Given("I have a keyphrase '$text'") do |text|
  @text = text
  @keyphrase = text
end
|
4
|
+
|
5
|
+
# Loads the saved BBC story HTML fixture into @text as raw text.
# File.read is used because `readlines.to_s` returns the Array#inspect dump of
# the lines on Ruby 1.9+, not the page contents.
Given(/^I have a sample BBC story$/) do
  @text = File.read('features/mocks/bbc_story.html')
end
|
8
|
+
|
9
|
+
# Replaces @text with the result of passing it through Jkl.sanitize.
When(/^I sanitize this text$/) do
  @text = Jkl.sanitize(@text)
end
|
12
|
+
|
13
|
+
# Asserts that @text is neither nil nor the empty string.
Then(/^it should be ok$/) do
  @text.should_not be_nil
  @text.should_not == ""
end
|
17
|
+
|
18
|
+
# Asserts that @text equals the quoted expectation exactly.
Then("it should say '$text'") do |expected|
  @text.should == expected
end
|
21
|
+
|
22
|
+
# Sets @text to a small HTML fragment made of plain text, a <script> element
# and a paragraph.
Given(/^I have some script tag data$/) do
  @text = <<-HTML
some start stuff here
<script type="text/javascript" charset="utf-8">
function nofunction(){var bob;}
</script>
<p> some para stuff here </p>
some end stuff here
  HTML
end
|
32
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Loads the canned Twitter trends JSON fixture into @response as raw text.
# File.read is used because `readlines.to_s` returns the Array#inspect dump of
# the lines on Ruby 1.9+, which JSON.parse would read as an array of lines.
Given(/^I have a mock twitter response$/) do
  @response = File.read('features/mocks/twitter.json')
end
|
4
|
+
|
5
|
+
# Reads the Twitter trends URL from config/config.yml into @url and stores the
# raw response body in @response.
When(/^I request trends data from twitter$/) do
  @url = YAML.load_file('config/config.yml')['twitter']
  @response = Jkl.get_from(@url)
end
|
9
|
+
|
10
|
+
# Parses @response as JSON and asserts that every trend has a name and a url
# longer than one character.
Then(/^I should see some trends$/) do
  JSON.parse(@response)['trends'].each do |trend|
    %w[name url].each { |field| trend[field].length.should > 1 }
  end
end
|
data/lib/jkl.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require "cgi"
|
2
|
+
require "jkl/rest_client.rb"
|
3
|
+
require "jkl/rss_client.rb"
|
4
|
+
require "jkl/calais_client.rb"
|
5
|
+
require "jkl/url_doc_handler.rb"
|
6
|
+
|
7
|
+
module Jkl

  # Fetches the feed for a keyphrase.
  #
  # feed      - base feed URL; the keyphrase is appended to it verbatim.
  # keyphrase - search phrase (not escaped here).
  #
  # Returns whatever get_from_as_xml returns for the combined URL.
  def self.headlines(feed, keyphrase)
    get_from_as_xml("#{feed}#{keyphrase}")
  end

  # Concatenates the descriptions of every item in the feed into one String.
  #
  # headlines - feed document as accepted by get_items_from.
  #
  # Returns a String ("" when the feed has no items).
  def self.pages(headlines)
    descriptions(headlines).join
  end

  # Collects the description of every item in the feed, with any CDATA
  # wrapper markers removed.
  #
  # headlines - feed document as accepted by get_items_from.
  #
  # Returns an Array of Strings, one per item.
  def self.descriptions(headlines)
    get_items_from(headlines).map do |item|
      attribute_from(item, :description).gsub("<![CDATA[", "").gsub("]]>", "")
    end
  end

  # Collects the link of every item in the feed.
  #
  # headlines - feed document as accepted by get_items_from.
  #
  # Returns an Array with one link per item.
  def self.links(headlines)
    get_items_from(headlines).map { |item| attribute_from(item, :link) }
  end

  # Submits text to Calais for tagging.
  #
  # key   - Calais license id.
  # pages - text content to analyse.
  #
  # Returns the result of Calais.process_document.
  def self.tags(key, pages)
    Calais.process_document(
      :content => pages,
      :content_type => :text,
      :license_id => key
    )
  end

end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "json"
require "uri"
require "yaml"
require "rest_client"
require "calais"
|
4
|
+
|
5
|
+
module Jkl

  # Keys of a Calais tag hash that jkl does not work with.
  UNWANTED_TAG_KEYS = %w[relevance instances _typeGroup].freeze

  # Values that count as blank and are dropped from a tag hash.
  BLANK_TAG_VALUES = ["N/A", [], ""].freeze

  # Posts content to the OpenCalais REST endpoint and asks for a JSON reply.
  # The license id is read from config/keys.yml.
  #
  # content - text to analyse.
  #
  # Returns the value of post_to, or nil when anything goes wrong (the error
  # is reported on stderr). Only StandardError is rescued so that interrupts
  # and exit requests still propagate.
  def self.get_from_calais(content)
    license_id = YAML.load_file('config/keys.yml')['calais']
    c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
    post_args = {
      'licenseID' => license_id,
      'content' => content,
      'paramsXML' => paramsXML('application/json')
    }
    post_to(c_uri, post_args)
  rescue StandardError => e
    $stderr.puts "Jkl.get_from_calais failed: #{e.message}"
    nil
  end

  # Parses a Calais JSON response into cleaned tag hashes.
  # The top-level "doc" entry is discarded; every remaining value is cleaned
  # with clean_unwanted_items_from_hash and, when a block is given, yielded.
  #
  # response - JSON text returned by Calais.
  #
  # Returns an Array of the cleaned tag hashes, in response order.
  def self.get_tag_from_json(response)
    result = JSON.parse(response)
    result.delete("doc") # ditching the doc
    result.values.map do |tag|
      cleaned = clean_unwanted_items_from_hash(tag)
      yield cleaned if block_given?
      cleaned
    end
  end

  # Placeholder; currently does nothing and returns nil.
  # TODO: work out how to implement this.
  def self.get_calais_metadata(response)
  end

  # Removes, in place, the entries jkl does not work with: the keys listed in
  # UNWANTED_TAG_KEYS and any entry whose value is "N/A", [] or "".
  #
  # h - Hash to clean (mutated).
  #
  # Returns the same Hash.
  def self.clean_unwanted_items_from_hash(h)
    h.delete_if do |key, value|
      UNWANTED_TAG_KEYS.include?(key) || BLANK_TAG_VALUES.any? { |blank| value == blank }
    end
  end

  # Builds the paramsXML document sent with a Calais request.
  # A bare `private` keyword does not apply to `def self.` methods, so this
  # helper is publicly callable and is left that way for compatibility.
  #
  # format - requested output format, e.g. 'application/json'.
  #
  # Returns the XML as a String.
  def self.paramsXML(format)
    <<-PARAMS
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<c:processingDirectives
c:contentType="text/txt"
c:outputFormat="#{format}">
</c:processingDirectives>
<c:userDirectives />
<c:externalMetadata />
</c:params>
    PARAMS
  end
end
|