jakal 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/License.txt +22 -0
- data/README.rdoc +28 -0
- data/features/calais.feature +38 -0
- data/features/http.feature +32 -0
- data/features/mocks/bbc_story.html +2863 -0
- data/features/mocks/calais.json +2464 -0
- data/features/mocks/topix_rss.xml +47 -0
- data/features/mocks/twitter.json +11 -0
- data/features/processing.feature +16 -0
- data/features/sanitize-text.feature +53 -0
- data/features/step_definitions/calais_steps.rb +44 -0
- data/features/step_definitions/http_steps.rb +56 -0
- data/features/step_definitions/processing_steps.rb +30 -0
- data/features/step_definitions/require_steps.rb +12 -0
- data/features/step_definitions/sanitize-text_steps.rb +32 -0
- data/features/step_definitions/twitter_steps.rb +17 -0
- data/features/support/env.rb +10 -0
- data/lib/jkl.rb +48 -0
- data/lib/jkl/calais_client.rb +64 -0
- data/lib/jkl/rest_client.rb +36 -0
- data/lib/jkl/rss_client.rb +15 -0
- data/lib/jkl/url_doc_handler.rb +31 -0
- metadata +78 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
2
|
+
<?xml-stylesheet href="/static/rss.3.xsl" type="text/xsl"?>
|
3
|
+
<rss xmlns:topix="http://www.topix.com/partners/rsscomment/" xmlns:georss="http://www.georss.org/georss" version="2.0">
|
4
|
+
<channel>
|
5
|
+
<title>Search for "london" </title>
|
6
|
+
<link>http://www.topix.com/search/article?q=london&x=0&y=0</link>
|
7
|
+
<topix:rsslink>http://www.topix.com/rss/search/article.xml?q=london&x=0&y=0</topix:rsslink>
|
8
|
+
<description>News continually updated from thousands of sources across the web</description>
|
9
|
+
<language>en-us</language>
|
10
|
+
<ttl>240</ttl>
|
11
|
+
<copyright>Copyright 2008, Topix</copyright>
|
12
|
+
<image>
|
13
|
+
<title>Topix</title>
|
14
|
+
<link>http://www.topix.com/</link>
|
15
|
+
<url>http://topix.cachefly.net/pics/topix_homepage_logo2.png</url>
|
16
|
+
</image>
|
17
|
+
<item>
|
18
|
+
<title>Major Michael Jackson tribute planned for Vienna</title>
|
19
|
+
<link>http://www.localnews8.com/Global/story.asp?S=10876507</link>
|
20
|
+
<description><![CDATA[The King of Pop will get a royal send-off next month in Vienna. Events promoter World Awards Media GmbH confirmed Monday that members of Michael Jackson's family and a "high-profile lineup of international stars" are planning a tribute concert in the Austrian capital.]]></description>
|
21
|
+
<source>KIFI</source>
|
22
|
+
<pubDate>Mon, 10 Aug 2009 15:21:58 GMT</pubDate>
|
23
|
+
<category>Jermaine Jackson</category>
|
24
|
+
<category>Michael Jackson</category>
|
25
|
+
<category>Pop/Rock</category>
|
26
|
+
<category>Black Entertainment</category>
|
27
|
+
<category>R-N-B</category>
|
28
|
+
<guid isPermaLink="false">C39RO2C8Q8NQR825</guid>
|
29
|
+
</item>
|
30
|
+
<item>
|
31
|
+
<title>LATEST: Man stabbed to death with machete in pub garden</title>
|
32
|
+
<link>http://www.thisislondon.co.uk/standard/article-23730262-details/LATEST%3A+Man+stabbed+to+death+with+machete+in+pub+garden/article.do</link>
|
33
|
+
<description><![CDATA[A man drinking with his girlfriend in a beer garden was hacked to death with machetes in front of horrified drinkers.]]></description>
|
34
|
+
<source>This is London</source>
|
35
|
+
<pubDate>Mon, 10 Aug 2009 15:21:51 GMT</pubDate>
|
36
|
+
<category>Greater London County, England</category>
|
37
|
+
<category>Cheshire County, England</category>
|
38
|
+
<category>England, United Kingdom</category>
|
39
|
+
<category>United Kingdom</category>
|
40
|
+
<category>Sutton, England</category>
|
41
|
+
<category>London, England</category>
|
42
|
+
<category>World News</category>
|
43
|
+
<category>Essex County, England</category>
|
44
|
+
<guid isPermaLink="false">KN03NCSH3KIVG416</guid>
|
45
|
+
</item>
|
46
|
+
</channel>
|
47
|
+
</rss>
|
@@ -0,0 +1,11 @@
|
|
1
|
+
{
|
2
|
+
"trends": [{
|
3
|
+
"name": "musicmonday",
|
4
|
+
"url": "http:\/\/search.twitter.com\/search?q=%23musicmonday"
|
5
|
+
},
|
6
|
+
{
|
7
|
+
"name": "GI Joe",
|
8
|
+
"url": "http:\/\/search.twitter.com\/search?q=%22GI+Joe%22+OR+Joe"
|
9
|
+
}],
|
10
|
+
"as_of": "Mon, 10 Aug 2009 15:04:54 +0000"
|
11
|
+
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
Feature: Processing features
|
2
|
+
In order to integrate our apps
|
3
|
+
As a developer
|
4
|
+
I want to make some requests and inspect some responses
|
5
|
+
|
6
|
+
@connection_needed
|
7
|
+
Scenario: end to end flow, keyphrase to tags
|
8
|
+
Given I have a keyphrase 'london restaurants'
|
9
|
+
When I request some RSS
|
10
|
+
Then I should receive some headlines
|
11
|
+
And I should be able to get the copy from the first headline
|
12
|
+
When I post to calais
|
13
|
+
Then I should receive some tags
|
14
|
+
And I should be able to persist these tags
|
15
|
+
When I generate a view of the recent keyword results
|
16
|
+
Then I should see a network graph
|
@@ -0,0 +1,53 @@
|
|
1
|
+
Feature: Sanitizing text
|
2
|
+
In order to integrate our apps
|
3
|
+
As a developer
|
4
|
+
I want to make some requests and inspect some responses
|
5
|
+
|
6
|
+
@unit @text
|
7
|
+
Scenario: Sanitize some ok text
|
8
|
+
Given I have a keyphrase 'the cat sat on the mat'
|
9
|
+
When I sanitize this text
|
10
|
+
Then it should be ok
|
11
|
+
And it should say 'the cat sat on the mat'
|
12
|
+
|
13
|
+
@unit @text
|
14
|
+
Scenario: Sanitize some short text
|
15
|
+
Given I have a keyphrase 'the cat sat'
|
16
|
+
When I sanitize this text
|
17
|
+
Then it should say ''
|
18
|
+
|
19
|
+
@unit @text @wip
|
20
|
+
Scenario: Sanitize some text with tabs and spaces
|
21
|
+
Given I have a keyphrase 'the cat sat on the mat '
|
22
|
+
When I sanitize this text
|
23
|
+
Then it should say 'the cat sat on the mat'
|
24
|
+
|
25
|
+
@unit @text @wip
|
26
|
+
Scenario: Sanitize some short text with tabs and spaces
|
27
|
+
Given I have a keyphrase 'the cat sat on '
|
28
|
+
When I sanitize this text
|
29
|
+
Then it should say ''
|
30
|
+
|
31
|
+
@unit @text
|
32
|
+
Scenario: Sanitize some tagged short text
|
33
|
+
Given I have a keyphrase '<a href="a-link.html>the cat sat</a>'
|
34
|
+
When I sanitize this text
|
35
|
+
Then it should say ''
|
36
|
+
|
37
|
+
@unit @text
|
38
|
+
Scenario: Sanitize some tagged text
|
39
|
+
Given I have a keyphrase '<a href="a-link.html>the cat sat on the mat</a>'
|
40
|
+
When I sanitize this text
|
41
|
+
Then it should be ok
|
42
|
+
Then it should say 'the cat sat on the mat'
|
43
|
+
|
44
|
+
@unit @text @wip
|
45
|
+
Scenario: Remove script tags
|
46
|
+
Given I have some script tag data
|
47
|
+
When I sanitize this text
|
48
|
+
Then it should say ' some para stuff here '
|
49
|
+
|
50
|
+
Scenario: Clean a web page
|
51
|
+
Given I have a sample BBC story
|
52
|
+
When I sanitize this text
|
53
|
+
Then it should be ok
|
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
# Cucumber step definitions for the OpenCalais features.
# Steps share state through @text (input copy), @response (raw Calais JSON)
# and @processed_json (a parsed response after cleaning).

Given /^I have some simple text$/ do
  @text = "Barack Obama said today that he expects there to be conflict within his new security team after confirming Hillary Clinton as his choice for US Secretary of State."
end

# Composite step: loads the BBC mock and runs it through the sanitizer by
# invoking the two existing steps.
Given /^I have a sanitized sample BBC story$/ do
  Given "I have a sample BBC story"
  When "I sanitize this text"
end

Given /^I have a mock calais response$/ do
  # File.read returns the file content as one String on every Ruby version.
  # The former `readlines.to_s` only did that on Ruby 1.8; from 1.9 on,
  # Array#to_s returns the inspect form, which is not the JSON document.
  @response = File.read('features/mocks/calais.json')
end

When /^I post to calais$/ do
  @response = Jkl::get_from_calais @text
end

When /^I remove the unwanted items$/ do
  @processed_json = Jkl::clean_unwanted_items_from_hash(JSON.parse(@response))
end

Then /^there should no longer be any "([^\"]*)"$/ do |arg1|
  @processed_json[arg1].should be_nil
end

Then /^I should receive some tags$/ do
  Jkl::get_tag_from_json(@response) do |tag|
    tag.should_not be_nil
  end
end

# NOTE(review): this step only prints the _type of each tag; it makes no
# assertion and does not use arg1 — confirm what it is meant to verify.
Then /^there should be some "([^\"]*)" tags$/ do |arg1|
  Jkl::get_tag_from_json(@response) do |tag|
    tag.each { |k, v| puts "#{k} : #{v}" if k == '_type' }
  end
end

Then /^I should be able to see the whole lot of tags as one block$/ do
  tags = Jkl::get_tag_from_json(@response)
  tags.length.should > 0
end
|
44
|
+
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Cucumber step definitions for the HTTP / RSS features.
# Steps share state through @response (last payload fetched), @items and
# @links (parsed from an RSS response) and @text (sanitized page copy).

When /^I post some data to yahoo$/ do
  @url = URI.parse('http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction')
  # Plain local only: the step used to assign a LICENSE_ID constant here as
  # well, which re-initialised a global constant every time the step ran.
  appid = YAML::load_file('config/keys.yml')['yahoo']
  # NOTE(review): URI.encode was removed in Ruby 3.0, and Jkl::post_to may
  # already form-encode its arguments — confirm before changing this call.
  context = URI.encode('Italian sculptors and painters of the renaissance favored the Virgin Mary for inspiration')
  post_args = { 'appid' => appid, 'context' => context, 'output' => 'json' }
  @response = Jkl::post_to @url, post_args
end

When /^I request some RSS$/ do
  # Falls back to a fixed keyphrase when no earlier step set @keyphrase.
  keyphrase = @keyphrase || "iraq"
  url = "#{YAML::load_file('config/config.yml')['topix']}#{CGI::escape(keyphrase)}"
  @response = Jkl::get_from_as_xml url
end

Given /^I have some RSS$/ do
  # File.read yields the raw XML as one String on every Ruby version;
  # `readlines.to_s` returns the Array inspect form on Ruby 1.9+.
  raw = File.read('features/mocks/topix_rss.xml')
  @response = Hpricot.XML raw
end

When /^I make a restful get request$/ do
  url = "http://news.bbc.co.uk/1/hi/uk_politics/7677419.stm"
  @response = Jkl::get_from url
end

When /^I request some trends$/ do
  twitter_json_url = YAML::load_file('config/config.yml')['twitter']
  output = JSON.parse Jkl::get_from twitter_json_url
  @response = output['trends']
end

Then /^I should get a response$/ do
  @response.should_not be_nil
end

Then /^I should receive some headlines$/ do
  @items = Jkl::get_items_from @response
  @links = @items.map { |item| Jkl::attribute_from(item, :link) }
  @links.length.should > 0
end

Then /^I should be able to get the copy from the first headline$/ do
  @response = Jkl::get_from @links[0]
  @response.should_not be_nil
  @response.should_not == ""
  # Keep the sanitized copy in @text for later steps.
  @text = Jkl::sanitize @response
end

Then /^I should see some text$/ do
  @response.length.should > 0
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
|
2
|
+
############### pending steps below ################
|
3
|
+
|
4
|
+
|
5
|
+
When /^I request tags for the first story$/ do
  # Query Calais once and reuse the response. The step used to discard the
  # first response and issue a second, unqualified get_from_calais call.
  response = Jkl::get_from_calais @story
  Jkl::get_tag_from_json(response) do |tag|
    tag.each { |k, v| puts "#{k} : #{v}" }
  end
end

When /^I request stories from Topix$/ do
  search_term = 'london'
  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
  @response = Jkl::get_from_as_xml url
end

When /^I get some news stories from the first keyword$/ do
  # NOTE(review): the search term is hard-coded. The step used to compute
  #   @trend['name'].gsub('#','')   (trend name without its leading hash)
  # and then overwrite it with 'london' on the next line; restore the
  # trend-based term once this pending step is finished.
  search_term = 'london'
  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
  rss_response = Jkl::get_from_as_xml url
  items = Jkl::get_items_from rss_response
  # Qualified with Jkl:: like the other step files.
  links = items.map { |item| Jkl::attribute_from(item, :link) }
  @story = Jkl::sanitize Jkl::from_doc Jkl::get_from links[0]
end
|
30
|
+
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
require 'json'
|
3
|
+
require 'restclient'
|
4
|
+
require 'haml'
|
5
|
+
require 'cgi'
|
6
|
+
require 'lib/jkl.rb'
|
7
|
+
require 'lib/jkl/calais_client.rb'
|
8
|
+
require 'lib/jkl/rest_client.rb'
|
9
|
+
require 'lib/jkl/rss_client.rb'
|
10
|
+
require 'lib/jkl/url_doc_handler.rb'
|
11
|
+
|
12
|
+
include Jkl
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Cucumber step definitions for the text-sanitizing features.
# All steps read and write the text under test through @text.

Given "I have a keyphrase '$text'" do |text|
  @text = text
end

Given /^I have a sample BBC story$/ do
  # File.read returns the page as one String on every Ruby version;
  # `readlines.to_s` returns the Array inspect form on Ruby 1.9+.
  @text = File.read('features/mocks/bbc_story.html')
end

When /^I sanitize this text$/ do
  @text = Jkl::sanitize @text
end

Then /^it should be ok$/ do
  @text.should_not be_nil
  @text.should_not == ""
end

Then "it should say '$text'" do |text|
  @text.should == text
end

Given /^I have some script tag data$/ do
  # Sample markup with a <script> element between plain text and a paragraph.
  @text = <<-EOF
some start stuff here
<script type="text/javascript" charset="utf-8">
function nofunction(){var bob;}
</script>
<p> some para stuff here </p>
some end stuff here
  EOF
end
|
32
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Cucumber step definitions for the Twitter trends feature.
# @response holds the raw JSON text returned by (or mocked for) Twitter.

Given /^I have a mock twitter response$/ do
  # File.read returns the JSON document as one String on every Ruby version.
  # With `readlines.to_s` on Ruby 1.9+ the text is the Array inspect form,
  # so JSON.parse would return an Array and result['trends'] would fail.
  @response = File.read('features/mocks/twitter.json')
end

When /^I request trends data from twitter$/ do
  @url = YAML::load_file('config/config.yml')['twitter']
  @response = Jkl::get_from @url
end

Then /^I should see some trends$/ do
  result = JSON.parse @response
  trends = result['trends']
  # Every trend entry must carry a non-trivial name and url.
  trends.each do |subject|
    subject['name'].length.should > 1
    subject['url'].length.should > 1
  end
end
|
data/lib/jkl.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require "cgi"
|
2
|
+
require "jkl/rest_client.rb"
|
3
|
+
require "jkl/rss_client.rb"
|
4
|
+
require "jkl/calais_client.rb"
|
5
|
+
require "jkl/url_doc_handler.rb"
|
6
|
+
|
7
|
+
# Convenience entry points: fetch a feed for a keyphrase, pull the
# descriptions and links out of it, and send text to Calais for tagging.
# Relies on get_from_as_xml, get_items_from and attribute_from, which are
# defined on Jkl in the other files required by this library.
module Jkl

  # Fetches the feed for a keyphrase.
  #
  # feed      - base feed URL.
  # keyphrase - search term, appended to feed verbatim (no escaping is
  #             performed here).
  #
  # Returns whatever get_from_as_xml returns for the combined URL.
  def self.headlines(feed, keyphrase)
    get_from_as_xml "#{feed}#{keyphrase}"
  end

  # Returns all item descriptions of the feed as one String, concatenated
  # in feed order with no separator. CDATA markers are removed.
  def self.pages(headlines)
    descriptions(headlines).join
  end

  # Returns an Array with the description of every feed item, CDATA markers
  # removed. An item without a description contributes an empty String.
  def self.descriptions(headlines)
    get_items_from(headlines).map do |item|
      strip_cdata attribute_from(item, :description)
    end
  end

  # Returns an Array with the link of every feed item, in feed order.
  def self.links(headlines)
    get_items_from(headlines).map { |item| attribute_from(item, :link) }
  end

  # Sends text to Calais for processing.
  #
  # key   - Calais license id.
  # pages - the text content to process.
  #
  # Returns the result of Calais.process_document.
  def self.tags(key, pages)
    Calais.process_document(
      :content => pages,
      :content_type => :text,
      :license_id => key
    )
  end

  # Removes the "<![CDATA[" and "]]>" markers from text; nil becomes "".
  def self.strip_cdata(text)
    text.to_s.gsub("<![CDATA[", "").gsub("]]>", "")
  end
  private_class_method :strip_cdata

end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "json"
|
2
|
+
require "rest_client"
|
3
|
+
require "calais"
|
4
|
+
|
5
|
+
# Calais support for Jkl: posting content to the OpenCalais REST endpoint
# and reducing the JSON response to a list of cleaned tag hashes.
module Jkl

  # Posts content to OpenCalais and asks for a JSON response.
  #
  # The license id is read from the 'calais' entry of config/keys.yml; the
  # request itself is made through post_to.
  #
  # content - the text to analyse.
  #
  # Returns the value of post_to, or nil when anything goes wrong (the error
  # is reported on stderr). This stays best-effort, but only StandardError
  # is rescued so that interrupts and exit requests are no longer swallowed.
  def self.get_from_calais(content)
    license_id = YAML::load_file('config/keys.yml')['calais']
    c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
    post_args = { 'licenseID' => license_id, 'content' => content,
                  'paramsXML' => paramsXML('application/json') }
    post_to(c_uri, post_args)
  rescue StandardError => e
    warn "Jkl.get_from_calais failed: #{e.class}: #{e.message}"
    nil
  end

  # Parses a Calais JSON response into cleaned tag hashes.
  #
  # response - the JSON text of a Calais response.
  #
  # The top-level "doc" entry is dropped; every remaining value is passed
  # through clean_unwanted_items_from_hash. When a block is given, each
  # cleaned tag is also yielded to it.
  #
  # Returns an Array of the cleaned tags.
  def self.get_tag_from_json(response)
    result = JSON.parse(response)
    result.delete("doc") # ditching the doc
    result.values.map do |tag|
      cleaned = clean_unwanted_items_from_hash(tag)
      yield cleaned if block_given?
      cleaned
    end
  end

  # Placeholder; currently does nothing and returns nil.
  def self.get_calais_metadata(response)
    #ce = CalaisExtractor.new( response )
    #ce.prettify
    #TODO work out how to implement this
  end

  # jkl doesn't work with these aspects of the calais response, also
  # removing blanks.
  #
  # h - a Hash; it is modified in place.
  #
  # Removes the "relevance", "instances" and "_typeGroup" keys and every
  # entry whose value is "N/A", an empty Array or an empty String.
  #
  # Returns h.
  def self.clean_unwanted_items_from_hash(h)
    unwanted_keys = ["relevance", "instances", "_typeGroup"]
    blank_values  = ["N/A", [], ""]
    h.delete_if { |k, v| unwanted_keys.include?(k) || blank_values.include?(v) }
  end

  # Builds the paramsXML document sent along with a Calais request.
  #
  # format - the value for the c:outputFormat attribute, e.g.
  #          'application/json'.
  #
  # NOTE: a bare `private` used to precede this method, but `private` has no
  # effect on `def self.` methods, so this method has always been publicly
  # callable. It is left public to avoid breaking callers.
  def self.paramsXML(format)
    <<-PARAMS_XML
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<c:processingDirectives
c:contentType="text/txt"
c:outputFormat="#{format}">
</c:processingDirectives>
<c:userDirectives />
<c:externalMetadata />
</c:params>
    PARAMS_XML
  end
end
|