jakal 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,47 @@
1
+ <?xml version="1.0" encoding="ISO-8859-1"?>
2
+ <?xml-stylesheet href="/static/rss.3.xsl" type="text/xsl"?>
3
+ <rss xmlns:topix="http://www.topix.com/partners/rsscomment/" xmlns:georss="http://www.georss.org/georss" version="2.0">
4
+ <channel>
5
+ <title>Search for "london" </title>
6
+ <link>http://www.topix.com/search/article?q=london&amp;x=0&amp;y=0</link>
7
+ <topix:rsslink>http://www.topix.com/rss/search/article.xml?q=london&amp;x=0&amp;y=0</topix:rsslink>
8
+ <description>News continually updated from thousands of sources across the web</description>
9
+ <language>en-us</language>
10
+ <ttl>240</ttl>
11
+ <copyright>Copyright 2008, Topix</copyright>
12
+ <image>
13
+ <title>Topix</title>
14
+ <link>http://www.topix.com/</link>
15
+ <url>http://topix.cachefly.net/pics/topix_homepage_logo2.png</url>
16
+ </image>
17
+ <item>
18
+ <title>Major Michael Jackson tribute planned for Vienna</title>
19
+ <link>http://www.localnews8.com/Global/story.asp?S=10876507</link>
20
+ <description><![CDATA[The King of Pop will get a royal send-off next month in Vienna. Events promoter World Awards Media GmbH confirmed Monday that members of Michael Jackson's family and a "high-profile lineup of international stars" are planning a tribute concert in the Austrian capital.]]></description>
21
+ <source>KIFI</source>
22
+ <pubDate>Mon, 10 Aug 2009 15:21:58 GMT</pubDate>
23
+ <category>Jermaine Jackson</category>
24
+ <category>Michael Jackson</category>
25
+ <category>Pop/Rock</category>
26
+ <category>Black Entertainment</category>
27
+ <category>R-N-B</category>
28
+ <guid isPermaLink="false">C39RO2C8Q8NQR825</guid>
29
+ </item>
30
+ <item>
31
+ <title>LATEST: Man stabbed to death with machete in pub garden</title>
32
+ <link>http://www.thisislondon.co.uk/standard/article-23730262-details/LATEST%3A+Man+stabbed+to+death+with+machete+in+pub+garden/article.do</link>
33
+ <description><![CDATA[A man drinking with his girlfriend in a beer garden was hacked to death with machetes in front of horrified drinkers.]]></description>
34
+ <source>This is London</source>
35
+ <pubDate>Mon, 10 Aug 2009 15:21:51 GMT</pubDate>
36
+ <category>Greater London County, England</category>
37
+ <category>Cheshire County, England</category>
38
+ <category>England, United Kingdom</category>
39
+ <category>United Kingdom</category>
40
+ <category>Sutton, England</category>
41
+ <category>London, England</category>
42
+ <category>World News</category>
43
+ <category>Essex County, England</category>
44
+ <guid isPermaLink="false">KN03NCSH3KIVG416</guid>
45
+ </item>
46
+ </channel>
47
+ </rss>
@@ -0,0 +1,11 @@
1
+ {
2
+ "trends": [{
3
+ "name": "musicmonday",
4
+ "url": "http:\/\/search.twitter.com\/search?q=%23musicmonday"
5
+ },
6
+ {
7
+ "name": "GI Joe",
8
+ "url": "http:\/\/search.twitter.com\/search?q=%22GI+Joe%22+OR+Joe"
9
+ }],
10
+ "as_of": "Mon, 10 Aug 2009 15:04:54 +0000"
11
+ }
@@ -0,0 +1,16 @@
1
+ Feature: Processing features
2
+ In order to integrate our apps
3
+ As a developer
4
+ I want to make some requests and inspect some responses
5
+
6
+ @connection_needed
7
+ Scenario: end to end flow, keyphrase to tags
8
+ Given I have a keyphrase 'london restaurants'
9
+ When I request some RSS
10
+ Then I should receive some headlines
11
+ And I should be able to get the copy from the first headline
12
+ When I post to calais
13
+ Then I should receive some tags
14
+ And I should be able to persist these tags
15
+ When I generate a view of the recent keyword results
16
+ Then I should see a network graph
@@ -0,0 +1,53 @@
1
+ Feature: Processing features
2
+ In order to integrate our apps
3
+ As a developer
4
+ I want to make some requests and inspect some responses
5
+
6
+ @unit @text
7
+ Scenario: Sanitize some ok text
8
+ Given I have a keyphrase 'the cat sat on the mat'
9
+ When I sanitize this text
10
+ Then it should be ok
11
+ And it should say 'the cat sat on the mat'
12
+
13
+ @unit @text
14
+ Scenario: Sanitize some short text
15
+ Given I have a keyphrase 'the cat sat'
16
+ When I sanitize this text
17
+ Then it should say ''
18
+
19
+ @unit @text @wip
20
+ Scenario: Sanitize some text with tabs and spaces
21
+ Given I have a keyphrase 'the cat sat on the mat '
22
+ When I sanitize this text
23
+ Then it should say 'the cat sat on the mat'
24
+
25
+ @unit @text @wip
26
+ Scenario: Sanitize some short text with tabs and spaces
27
+ Given I have a keyphrase 'the cat sat on '
28
+ When I sanitize this text
29
+ Then it should say ''
30
+
31
+ @unit @text
32
+ Scenario: Sanitize some tagged short text
33
+ Given I have a keyphrase '<a href="a-link.html>the cat sat</a>'
34
+ When I sanitize this text
35
+ Then it should say ''
36
+
37
+ @unit @text
38
+ Scenario: Sanitize some tagged text
39
+ Given I have a keyphrase '<a href="a-link.html>the cat sat on the mat</a>'
40
+ When I sanitize this text
41
+ Then it should be ok
42
+ Then it should say 'the cat sat on the mat'
43
+
44
+ @unit @text @wip
45
+ Scenario: Remove script tags
46
+ Given I have some script tag data
47
+ When I sanitize this text
48
+ Then it should say ' some para stuff here '
49
+
50
+ Scenario: Clean a web page
51
+ Given I have a sample BBC story
52
+ When I sanitize this text
53
+ Then it should be ok
@@ -0,0 +1,44 @@
1
+
2
# Step definitions for the Calais tagging scenarios.
# State shared between steps: @text (copy to be tagged), @response (raw
# Calais JSON string) and @processed_json (parsed response after cleaning).

Given /^I have some simple text$/ do
  @text = "Barack Obama said today that he expects there to be conflict within his new security team after confirming Hillary Clinton as his choice for US Secretary of State."
end

# Composite step: delegates to the fixture-loading and sanitizing steps.
Given /^I have a sanitized sample BBC story$/ do
  Given "I have a sample BBC story"
  When "I sanitize this text"
end

Given /^I have a mock calais response$/ do
  # File.read returns the file contents on every Ruby version. The previous
  # `readlines.to_s` only did so on Ruby 1.8; from 1.9 on Array#to_s is the
  # inspect form, which is no longer the JSON document itself.
  @response = File.read('features/mocks/calais.json')
end

When /^I post to calais$/ do
  @response = Jkl::get_from_calais @text
end

When /^I remove the unwanted items$/ do
  @processed_json = Jkl::clean_unwanted_items_from_hash(JSON.parse(@response))
end

Then /^there should no longer be any "([^\"]*)"$/ do |arg1|
  @processed_json[arg1].should be_nil
end

# NOTE(review): passes vacuously when the response contains no tags at all.
Then /^I should receive some tags$/ do
  Jkl::get_tag_from_json(@response) do |tag|
    tag.should_not be_nil
  end
end

# NOTE(review): the captured tag type is not used and nothing is asserted;
# the step only prints the `_type` entry of every tag.
Then /^there should be some "([^\"]*)" tags$/ do |arg1|
  Jkl::get_tag_from_json(@response) do |tag|
    tag.each { |k, v| puts "#{k} : #{v}" if k == '_type' }
  end
end

Then /^I should be able to see the whole lot of tags as one block$/ do
  tags = Jkl::get_tag_from_json(@response)
  tags.length.should > 0
end
44
+
@@ -0,0 +1,56 @@
1
# Step definitions that perform (or fake) the HTTP requests used by the
# features. Results are left in @response; headline links go in @links.

When /^I post some data to yahoo$/ do
  @url = URI.parse('http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction')
  # NOTE(review): this also defines the top-level constant LICENSE_ID as a
  # side effect; kept because code outside this file may read it.
  appid = LICENSE_ID = YAML::load_file('config/keys.yml')['yahoo']
  # NOTE(review): URI.encode was removed in Ruby 3.0 - confirm the target
  # Ruby version before upgrading.
  context = URI.encode('Italian sculptors and painters of the renaissance favored the Virgin Mary for inspiration')
  post_args = { 'appid' => appid, 'context' => context, 'output' => 'json' }
  @response = Jkl::post_to @url, post_args
end

When /^I request some RSS$/ do
  # Falls back to a fixed search term when no @keyphrase has been set.
  keyphrase = @keyphrase || "iraq"
  url = "#{YAML::load_file('config/config.yml')['topix']}#{CGI::escape(keyphrase)}"
  @response = Jkl::get_from_as_xml url
end

Given /^I have some RSS$/ do
  # File.read returns the raw XML on every Ruby version; `readlines.to_s`
  # yields the Array inspect form on Ruby 1.9+, which is not the document.
  raw = File.read('features/mocks/topix_rss.xml')
  @response = Hpricot.XML raw
end

When /^I make a restful get request$/ do
  url = "http://news.bbc.co.uk/1/hi/uk_politics/7677419.stm"
  @response = Jkl::get_from url
end

When /^I request some trends$/ do
  twitter_json_url = YAML::load_file('config/config.yml')['twitter']
  output = JSON.parse Jkl::get_from twitter_json_url
  @response = output['trends']
end


Then /^I should get a response$/ do
  @response.should_not be_nil
end

Then /^I should receive some headlines$/ do
  @items = Jkl::get_items_from @response
  # One link per feed item, in feed order.
  @links = @items.map { |item| Jkl::attribute_from(item, :link) }
  @links.length.should > 0
end

Then /^I should be able to get the copy from the first headline$/ do
  @response = Jkl::get_from @links[0]
  @response.should_not be_nil
  @response.should_not == ""
  # The sanitized copy is stored in @text for the later steps that read it.
  @text = Jkl::sanitize @response
end

Then /^I should see some text$/ do
  @response.length.should > 0
end
@@ -0,0 +1,30 @@
1
+
2
+ ############### pending steps below ################
3
+
4
+
5
# Prints every key/value pair of every tag Calais returns for @story.
When /^I request tags for the first story$/ do
  # Fetch once and reuse the response. The previous code discarded this
  # result and then called a bare `get_from_calais`, which is a singleton
  # method of Jkl and therefore not callable from inside a step.
  calais_response = Jkl::get_from_calais @story
  Jkl::get_tag_from_json(calais_response) do |tag|
    tag.each { |k, v| puts "#{k} : #{v}" }
  end
end

When /^I request stories from Topix$/ do
  search_term = 'london'
  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
  @response = Jkl::get_from_as_xml url
end

When /^I get some news stories from the first keyword$/ do
  search_term = @trend['name'].gsub('#','') #removing hash from start of trend name
  # NOTE(review): the trend-derived term above is immediately overridden by a
  # fixed value; this looks like a leftover debugging override - confirm.
  search_term = 'london'
  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
  rss_response = Jkl::get_from_as_xml url
  items = Jkl::get_items_from rss_response
  # Module-qualified like every other Jkl call in the step files.
  links = items.map { |item| Jkl::attribute_from(item, :link) }
  @story = Jkl::sanitize Jkl::from_doc Jkl::get_from links[0]
end
30
+
@@ -0,0 +1,12 @@
1
+ require 'hpricot'
2
+ require 'json'
3
+ require 'restclient'
4
+ require 'haml'
5
+ require 'cgi'
6
+ require 'lib/jkl.rb'
7
+ require 'lib/jkl/calais_client.rb'
8
+ require 'lib/jkl/rest_client.rb'
9
+ require 'lib/jkl/rss_client.rb'
10
+ require 'lib/jkl/url_doc_handler.rb'
11
+
12
+ include Jkl
@@ -0,0 +1,32 @@
1
# Step definitions for the text sanitizing scenarios. The text under test
# is kept in @text.

Given "I have a keyphrase '$text'" do |text|
  @text = text
  # The "I request some RSS" step reads @keyphrase; without this assignment
  # it always fell back to its default search term.
  @keyphrase = text
end

Given /^I have a sample BBC story$/ do
  # File.read returns the raw HTML on every Ruby version; `readlines.to_s`
  # yields the Array inspect form on Ruby 1.9+.
  @text = File.read('features/mocks/bbc_story.html')
end

When /^I sanitize this text$/ do
  @text = Jkl::sanitize @text
end

Then /^it should be ok$/ do
  @text.should_not be_nil
  @text.should_not == ""
end

Then "it should say '$text'" do |text|
  @text.should == text
end

Given /^I have some script tag data$/ do
  @text = <<-EOF
some start stuff here
<script type="text/javascript" charset="utf-8">
function nofunction(){var bob;}
</script>
<p> some para stuff here </p>
some end stuff here
  EOF
end
32
+
@@ -0,0 +1,17 @@
1
# Step definitions for the Twitter trends scenarios. @response holds the
# raw JSON string, either from the fixture or from the live service.

Given /^I have a mock twitter response$/ do
  # File.read returns the JSON text on every Ruby version; `readlines.to_s`
  # yields the Array inspect form on Ruby 1.9+, so the later JSON.parse
  # would no longer see the trends document.
  @response = File.read('features/mocks/twitter.json')
end

When /^I request trends data from twitter$/ do
  @url = YAML::load_file('config/config.yml')['twitter']
  @response = Jkl::get_from @url
end

# NOTE(review): passes vacuously when the 'trends' list is empty.
Then /^I should see some trends$/ do
  result = JSON.parse @response
  trends = result['trends']
  trends.each do |subject|
    subject['name'].length.should > 1
    subject['url'].length.should > 1
  end
end
@@ -0,0 +1,10 @@
1
+ gem 'rack-test'
2
+
3
+ require 'spec/expectations'
4
+ require 'rack/test'
5
+
6
# World object for the scenarios; mixes in the Rack::Test helper methods.
class MyWorld
  include Rack::Test::Methods
end

# Register a fresh MyWorld instance through Cucumber's World hook.
World do
  MyWorld.new
end
data/lib/jkl.rb ADDED
@@ -0,0 +1,48 @@
1
+ require "cgi"
2
+ require "jkl/rest_client.rb"
3
+ require "jkl/rss_client.rb"
4
+ require "jkl/calais_client.rb"
5
+ require "jkl/url_doc_handler.rb"
6
+
7
# Entry points that combine the feed helpers (get_from_as_xml,
# get_items_from, attribute_from - defined in the other jkl files) and the
# Calais client.
module Jkl

  # Fetches the feed found at +feed+ with +keyphrase+ appended verbatim.
  # The keyphrase is NOT escaped here; callers must pass a URL-safe value.
  def self.headlines(feed, keyphrase)
    get_from_as_xml "#{feed}#{keyphrase}"
  end

  # Returns the descriptions of all items in +headlines+ concatenated into
  # one String (no separator), with CDATA markers removed.
  def self.pages(headlines)
    descriptions(headlines).join
  end

  # Returns an Array with the description of every item in +headlines+,
  # with CDATA markers removed.
  def self.descriptions(headlines)
    get_items_from(headlines).map do |item|
      strip_cdata attribute_from(item, :description)
    end
  end

  # Returns an Array with the link of every item in +headlines+.
  def self.links(headlines)
    get_items_from(headlines).map { |item| attribute_from(item, :link) }
  end

  # Sends +pages+ as plain text to Calais.process_document using the
  # licence +key+ and returns whatever that call returns.
  def self.tags(key, pages)
    Calais.process_document(
      :content => pages,
      :content_type => :text,
      :license_id => key
    )
  end

  # Removes the literal CDATA opening and closing markers from +text+.
  def self.strip_cdata(text)
    text.gsub("<![CDATA[", "").gsub("]]>", "")
  end
  private_class_method :strip_cdata

end
@@ -0,0 +1,64 @@
1
+ require "json"
2
+ require "rest_client"
3
+ require "calais"
4
+
5
# Calais client helpers: posting content for analysis and tidying the JSON
# that comes back.
module Jkl

  # Posts +content+ to the Calais REST endpoint, requesting JSON output, and
  # returns the result of post_to.
  #
  # The licence key is read from the 'calais' entry of config/keys.yml on
  # every call. Best-effort: a StandardError is reported on stderr and nil is
  # returned. Signals and exit requests are no longer swallowed, as they
  # were with `rescue Exception`.
  def self.get_from_calais(content)
    license_id = YAML::load_file('config/keys.yml')['calais']
    c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
    post_args = { 'licenseID' => license_id, 'content' => content,
                  'paramsXML' => paramsXML('application/json') }
    post_to(c_uri, post_args)
  rescue StandardError => e
    warn e
    nil
  end

  # Parses the JSON string +response+, drops its "doc" entry and returns an
  # Array of the remaining values, each cleaned with
  # clean_unwanted_items_from_hash. Yields every cleaned tag, in order, when
  # a block is given.
  def self.get_tag_from_json(response)
    entries = JSON.parse response
    entries.delete("doc") # ditching the doc
    entries.values.map do |tag|
      cleaned = Jkl::clean_unwanted_items_from_hash tag
      yield cleaned if block_given?
      cleaned
    end
  end

  # Placeholder; currently does nothing and returns nil.
  def self.get_calais_metadata(response)
    #ce = CalaisExtractor.new( response )
    #ce.prettify
    #TODO work out how to implement this
  end

  # jkl doesn't work with these aspects of the calais response, also removing blanks.
  #
  # Removes, in place, the "relevance", "instances" and "_typeGroup" keys and
  # every entry whose value is "N/A", an empty Array or an empty String.
  # Returns the same (mutated) Hash +h+.
  def self.clean_unwanted_items_from_hash(h)
    h.delete_if do |k, v|
      k == "relevance" || k == "instances" || k == "_typeGroup" ||
        v == "N/A" || v == [] || v == ""
    end
  end

  # Builds the paramsXML document sent with each Calais request; +format+ is
  # interpolated into the c:outputFormat attribute.
  def self.paramsXML(format)
    <<-PARAMS_XML
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<c:processingDirectives
c:contentType="text/txt"
c:outputFormat="#{format}">
</c:processingDirectives>
<c:userDirectives />
<c:externalMetadata />
</c:params>
    PARAMS_XML
  end
  # A bare `private` does not apply to `def self.` methods, so the intended
  # visibility is declared explicitly.
  private_class_method :paramsXML
end