jakal 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ <?xml version="1.0" encoding="ISO-8859-1"?>
2
+ <?xml-stylesheet href="/static/rss.3.xsl" type="text/xsl"?>
3
+ <rss xmlns:topix="http://www.topix.com/partners/rsscomment/" xmlns:georss="http://www.georss.org/georss" version="2.0">
4
+ <channel>
5
+ <title>Search for "london" </title>
6
+ <link>http://www.topix.com/search/article?q=london&amp;x=0&amp;y=0</link>
7
+ <topix:rsslink>http://www.topix.com/rss/search/article.xml?q=london&amp;x=0&amp;y=0</topix:rsslink>
8
+ <description>News continually updated from thousands of sources across the web</description>
9
+ <language>en-us</language>
10
+ <ttl>240</ttl>
11
+ <copyright>Copyright 2008, Topix</copyright>
12
+ <image>
13
+ <title>Topix</title>
14
+ <link>http://www.topix.com/</link>
15
+ <url>http://topix.cachefly.net/pics/topix_homepage_logo2.png</url>
16
+ </image>
17
+ <item>
18
+ <title>Major Michael Jackson tribute planned for Vienna</title>
19
+ <link>http://www.localnews8.com/Global/story.asp?S=10876507</link>
20
+ <description><![CDATA[The King of Pop will get a royal send-off next month in Vienna. Events promoter World Awards Media GmbH confirmed Monday that members of Michael Jackson's family and a "high-profile lineup of international stars" are planning a tribute concert in the Austrian capital.]]></description>
21
+ <source>KIFI</source>
22
+ <pubDate>Mon, 10 Aug 2009 15:21:58 GMT</pubDate>
23
+ <category>Jermaine Jackson</category>
24
+ <category>Michael Jackson</category>
25
+ <category>Pop/Rock</category>
26
+ <category>Black Entertainment</category>
27
+ <category>R-N-B</category>
28
+ <guid isPermaLink="false">C39RO2C8Q8NQR825</guid>
29
+ </item>
30
+ <item>
31
+ <title>LATEST: Man stabbed to death with machete in pub garden</title>
32
+ <link>http://www.thisislondon.co.uk/standard/article-23730262-details/LATEST%3A+Man+stabbed+to+death+with+machete+in+pub+garden/article.do</link>
33
+ <description><![CDATA[A man drinking with his girlfriend in a beer garden was hacked to death with machetes in front of horrified drinkers.]]></description>
34
+ <source>This is London</source>
35
+ <pubDate>Mon, 10 Aug 2009 15:21:51 GMT</pubDate>
36
+ <category>Greater London County, England</category>
37
+ <category>Cheshire County, England</category>
38
+ <category>England, United Kingdom</category>
39
+ <category>United Kingdom</category>
40
+ <category>Sutton, England</category>
41
+ <category>London, England</category>
42
+ <category>World News</category>
43
+ <category>Essex County, England</category>
44
+ <guid isPermaLink="false">KN03NCSH3KIVG416</guid>
45
+ </item>
46
+ </channel>
47
+ </rss>
@@ -0,0 +1,11 @@
1
+ {
2
+ "trends": [{
3
+ "name": "musicmonday",
4
+ "url": "http:\/\/search.twitter.com\/search?q=%23musicmonday"
5
+ },
6
+ {
7
+ "name": "GI Joe",
8
+ "url": "http:\/\/search.twitter.com\/search?q=%22GI+Joe%22+OR+Joe"
9
+ }],
10
+ "as_of": "Mon, 10 Aug 2009 15:04:54 +0000"
11
+ }
@@ -0,0 +1,16 @@
1
+ Feature: Processing features
2
+ In order to integrate our apps
3
+ As a developer
4
+ I want to make some requests and inspect some responses
5
+
6
+ @connection_needed
7
+ Scenario: end to end flow, keyphrase to tags
8
+ Given I have a keyphrase 'london restaurants'
9
+ When I request some RSS
10
+ Then I should receive some headlines
11
+ And I should be able to get the copy from the first headline
12
+ When I post to calais
13
+ Then I should receive some tags
14
+ And I should be able to persist these tags
15
+ When I generate a view of the recent keyword results
16
+ Then I should see a network graph
@@ -0,0 +1,53 @@
1
+ Feature: Processing features
2
+ In order to integrate our apps
3
+ As a developer
4
+ I want to make some requests and inspect some responses
5
+
6
+ @unit @text
7
+ Scenario: Sanitize some ok text
8
+ Given I have a keyphrase 'the cat sat on the mat'
9
+ When I sanitize this text
10
+ Then it should be ok
11
+ And it should say 'the cat sat on the mat'
12
+
13
+ @unit @text
14
+ Scenario: Sanitize some short text
15
+ Given I have a keyphrase 'the cat sat'
16
+ When I sanitize this text
17
+ Then it should say ''
18
+
19
+ @unit @text @wip
20
+ Scenario: Sanitize some text with tabs and spaces
21
+ Given I have a keyphrase 'the cat sat on the mat '
22
+ When I sanitize this text
23
+ Then it should say 'the cat sat on the mat'
24
+
25
+ @unit @text @wip
26
+ Scenario: Sanitize some short text with tabs and spaces
27
+ Given I have a keyphrase 'the cat sat on '
28
+ When I sanitize this text
29
+ Then it should say ''
30
+
31
+ @unit @text
32
+ Scenario: Sanitize some tagged short text
33
+ Given I have a keyphrase '<a href="a-link.html>the cat sat</a>'
34
+ When I sanitize this text
35
+ Then it should say ''
36
+
37
+ @unit @text
38
+ Scenario: Sanitize some tagged text
39
+ Given I have a keyphrase '<a href="a-link.html>the cat sat on the mat</a>'
40
+ When I sanitize this text
41
+ Then it should be ok
42
+ Then it should say 'the cat sat on the mat'
43
+
44
+ @unit @text @wip
45
+ Scenario: Remove script tags
46
+ Given I have some script tag data
47
+ When I sanitize this text
48
+ Then it should say ' some para stuff here '
49
+
50
+ Scenario: Clean a web page
51
+ Given I have a sample BBC story
52
+ When I sanitize this text
53
+ Then it should be ok
@@ -0,0 +1,44 @@
1
+
2
# Step definitions covering the Calais tagging flow.
#
# State shared between steps lives in instance variables on the Cucumber world:
#   @text            - copy that is posted to Calais
#   @response        - raw JSON string (live or mocked) returned by Calais
#   @processed_json  - parsed response after unwanted entries were stripped

Given /^I have some simple text$/ do
  @text = "Barack Obama said today that he expects there to be conflict within his new security team after confirming Hillary Clinton as his choice for US Secretary of State."
end

# Composite step: loads the BBC sample and runs it through the sanitizer by
# invoking the two underlying steps by name.
Given /^I have a sanitized sample BBC story$/ do
  Given "I have a sample BBC story"
  When "I sanitize this text"
end

Given /^I have a mock calais response$/ do
  # File.read returns the file contents as one String on every Ruby version;
  # readlines.to_s yields the Array's inspect form on Ruby 1.9 and later,
  # which is not parseable JSON.
  @response = File.read('features/mocks/calais.json')
end

When /^I post to calais$/ do
  @response = Jkl::get_from_calais(@text)
end

When /^I remove the unwanted items$/ do
  @processed_json = Jkl::clean_unwanted_items_from_hash(JSON.parse(@response))
end

Then /^there should no longer be any "([^\"]*)"$/ do |key|
  @processed_json[key].should be_nil
end

Then /^I should receive some tags$/ do
  Jkl::get_tag_from_json(@response) do |tag|
    tag.should_not be_nil
  end
end

# NOTE(review): this step only prints each tag's '_type' entry; it makes no
# assertion and never looks at the captured tag name, so it cannot fail on
# content. Confirm what the expectation should be before tightening it.
Then /^there should be some "([^\"]*)" tags$/ do |_tag_name|
  Jkl::get_tag_from_json(@response) do |tag|
    tag.each { |k, v| puts "#{k} : #{v}" if k == '_type' }
  end
end

Then /^I should be able to see the whole lot of tags as one block$/ do
  tags = Jkl::get_tag_from_json(@response)
  tags.length.should > 0
end
44
+
@@ -0,0 +1,56 @@
1
# Step definitions that issue (or mock) the outbound HTTP requests and check
# the raw responses. Results are handed between steps through @response,
# @items, @links and @text.

When /^I post some data to yahoo$/ do
  @url = URI.parse('http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction')
  # NOTE(review): LICENSE_ID is (re)assigned as a constant every time this step
  # runs; kept because code outside this file may read it - confirm.
  appid = LICENSE_ID = YAML::load_file('config/keys.yml')['yahoo']
  # NOTE(review): URI.encode is no longer available on Ruby 3.0+; left as is
  # because the encoding Jkl::post_to expects is not visible here.
  context = URI.encode('Italian sculptors and painters of the renaissance favored the Virgin Mary for inspiration')
  post_args = { 'appid' => appid, 'context' => context, 'output' => 'json' }
  @response = Jkl::post_to(@url, post_args)
end

When /^I request some RSS$/ do
  # Falls back to a fixed keyphrase when no earlier step set one.
  keyphrase = @keyphrase || "iraq"
  url = "#{YAML::load_file('config/config.yml')['topix']}#{CGI::escape(keyphrase)}"
  @response = Jkl::get_from_as_xml(url)
end

Given /^I have some RSS$/ do
  # File.read returns the file contents as one String on every Ruby version;
  # readlines.to_s yields the Array's inspect form on Ruby 1.9 and later,
  # which is not the XML document.
  raw = File.read('features/mocks/topix_rss.xml')
  @response = Hpricot.XML(raw)
end

When /^I make a restful get request$/ do
  url = "http://news.bbc.co.uk/1/hi/uk_politics/7677419.stm"
  @response = Jkl::get_from(url)
end

When /^I request some trends$/ do
  twitter_json_url = YAML::load_file('config/config.yml')['twitter']
  output = JSON.parse(Jkl::get_from(twitter_json_url))
  @response = output['trends']
end


Then /^I should get a response$/ do
  @response.should_not == nil
end

Then /^I should receive some headlines$/ do
  @items = Jkl::get_items_from(@response)
  # One link per feed item, in feed order.
  @links = @items.map { |item| Jkl::attribute_from(item, :link) }
  @links.should_not == nil
  @links.length.should > 0
end

Then /^I should be able to get the copy from the first headline$/ do
  @response = Jkl::get_from(@links[0])
  @response.should_not be_nil
  @response.should_not == ""
  # The sanitized copy becomes the text used by later steps.
  @text = Jkl::sanitize(@response)
end

Then /^I should see some text$/ do
  @response.length.should > 0
end
@@ -0,0 +1,30 @@
1
+
2
+ ############### pending steps below ################
3
+
4
+
5
# Prints every key/value pair of every tag Calais returns for @story.
When /^I request tags for the first story$/ do
  # Single request: the response is fetched once and reused. The helper is
  # called through the Jkl namespace because it is defined as a module-level
  # method (def self.get_from_calais), which a bare call does not reach.
  response = Jkl::get_from_calais(@story)
  Jkl::get_tag_from_json(response) do |tag|
    tag.each { |k, v| puts "#{k} : #{v}" }
  end
end

When /^I request stories from Topix$/ do
  search_term = 'london'
  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
  @response = Jkl::get_from_as_xml(url)
end

When /^I get some news stories from the first keyword$/ do
  search_term = @trend['name'].gsub('#','') #removing hash from start of trend name
  # NOTE(review): the trend-derived term is immediately replaced by a fixed
  # value, so this step always searches for 'london' - confirm whether the
  # override is a leftover debugging aid.
  search_term = 'london'
  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
  rss_response = Jkl::get_from_as_xml(url)
  items = Jkl::get_items_from(rss_response)
  # Namespace-qualified like the other Jkl calls in this step.
  links = items.map { |item| Jkl::attribute_from(item, :link) }
  # The first linked page is fetched, converted and sanitized into @story.
  @story = Jkl::sanitize(Jkl::from_doc(Jkl::get_from(links[0])))
end
30
+
@@ -0,0 +1,12 @@
1
+ require 'hpricot'
2
+ require 'json'
3
+ require 'restclient'
4
+ require 'haml'
5
+ require 'cgi'
6
+ require 'lib/jkl.rb'
7
+ require 'lib/jkl/calais_client.rb'
8
+ require 'lib/jkl/rest_client.rb'
9
+ require 'lib/jkl/rss_client.rb'
10
+ require 'lib/jkl/url_doc_handler.rb'
11
+
12
+ include Jkl
@@ -0,0 +1,32 @@
1
# Step definitions for the text sanitizer. The text under test is carried in
# @text: it is set by a Given step, rewritten by the sanitize step and then
# inspected by the Then steps.

Given "I have a keyphrase '$text'" do |text|
  @text = text
end

Given /^I have a sample BBC story$/ do
  # File.read returns the file contents as one String on every Ruby version;
  # readlines.to_s yields the Array's inspect form on Ruby 1.9 and later,
  # which is not the HTML document.
  @text = File.read('features/mocks/bbc_story.html')
end

When /^I sanitize this text$/ do
  @text = Jkl::sanitize(@text)
end

Then /^it should be ok$/ do
  @text.should_not be_nil
  @text.should_not == ""
end

Then "it should say '$text'" do |text|
  @text.should == text
end

# Fixture mixing plain text, a script element and a paragraph.
Given /^I have some script tag data$/ do
  @text = <<-EOF
some start stuff here
<script type="text/javascript" charset="utf-8">
function nofunction(){var bob;}
</script>
<p> some para stuff here </p>
some end stuff here
  EOF
end
32
+
@@ -0,0 +1,17 @@
1
# Step definitions for the twitter trends feed. @response holds the raw JSON
# string, either read from the mock file or fetched from the configured URL.

Given /^I have a mock twitter response$/ do
  # File.read returns the file contents as one String on every Ruby version;
  # readlines.to_s yields the Array's inspect form on Ruby 1.9 and later,
  # which is not parseable JSON.
  @response = File.read('features/mocks/twitter.json')
end

When /^I request trends data from twitter$/ do
  @url = YAML::load_file('config/config.yml')['twitter']
  @response = Jkl::get_from(@url)
end

# Every entry under 'trends' must carry a name and a url longer than one
# character.
Then /^I should see some trends$/ do
  trends = JSON.parse(@response)['trends']
  trends.each do |subject|
    subject['name'].length.should > 1
    subject['url'].length.should > 1
  end
end
@@ -0,0 +1,10 @@
1
+ gem 'rack-test'
2
+
3
+ require 'spec/expectations'
4
+ require 'rack/test'
5
+
6
# World object for the Cucumber scenarios; mixes in the Rack::Test helper
# methods.
class MyWorld
  include Rack::Test::Methods
end

# Register a fresh MyWorld instance as the Cucumber world.
World do
  MyWorld.new
end
data/lib/jkl.rb ADDED
@@ -0,0 +1,48 @@
1
+ require "cgi"
2
+ require "jkl/rest_client.rb"
3
+ require "jkl/rss_client.rb"
4
+ require "jkl/calais_client.rb"
5
+ require "jkl/url_doc_handler.rb"
6
+
7
# Convenience entry points that chain the RSS and Calais helpers.
#
# get_from_as_xml, get_items_from and attribute_from are not defined in this
# file; they come from the jkl/* files required above.
module Jkl

  # Requests the feed for a keyphrase.
  #
  # feed      - base feed URL; the keyphrase is appended to it verbatim
  # keyphrase - search term; it is NOT escaped here, so callers must pass a
  #             URL-safe value
  #
  # Returns whatever get_from_as_xml returns for the combined URL.
  def self.headlines(feed, keyphrase)
    get_from_as_xml "#{feed}#{keyphrase}"
  end

  # Concatenates the description of every feed item into one String, with no
  # separator between items. Returns "" when the feed has no items.
  def self.pages(headlines)
    descriptions(headlines).join
  end

  # Returns an Array holding the description of every feed item, in feed
  # order, with the CDATA wrapper markers removed.
  def self.descriptions(headlines)
    get_items_from(headlines).map do |item|
      attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
    end
  end

  # Returns an Array holding the link of every feed item, in feed order.
  def self.links(headlines)
    get_items_from(headlines).map { |item| attribute_from(item, :link) }
  end

  # Sends text to Calais through the calais gem.
  #
  # key   - Calais license id
  # pages - text content to analyse
  #
  # Returns the result of Calais.process_document.
  def self.tags(key, pages)
    Calais.process_document(
      :content => pages,
      :content_type => :text,
      :license_id => key
    )
  end

end
@@ -0,0 +1,64 @@
1
+ require "json"
2
+ require "rest_client"
3
+ require "calais"
4
+
5
# Helpers for talking to the OpenCalais REST service and for trimming its JSON
# response down to the entries jkl works with.
#
# post_to is not defined in this file; it comes from another jkl file.
module Jkl

  # Keys that are dropped from every hash cleaned by
  # clean_unwanted_items_from_hash.
  UNWANTED_KEYS = ["relevance", "instances", "_typeGroup"].freeze

  # Values treated as blank; entries holding one of them are dropped.
  BLANK_VALUES = ["N/A", [], ""].freeze

  # Posts content to the OpenCalais REST endpoint, asking for JSON output.
  #
  # content - text to analyse
  #
  # The license id is read from config/keys.yml on every call. Returns what
  # post_to returns. Best effort: when a StandardError is raised (missing
  # config file, failed request, ...) the error is printed and nil is
  # returned. Interrupts and exit requests are not swallowed.
  def self.get_from_calais(content)
    license_id = YAML::load_file('config/keys.yml')['calais']
    c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
    post_args = { 'licenseID' => license_id, 'content' => content,
                  'paramsXML' => paramsXML('application/json') }
    post_to(c_uri, post_args)
  rescue StandardError => e
    puts e
  end

  # Parses a Calais JSON response and returns its tags.
  #
  # response - JSON String whose top level is an object
  #
  # The top-level "doc" entry is discarded; every remaining value is cleaned
  # with clean_unwanted_items_from_hash. When a block is given each cleaned tag
  # is also yielded to it. Returns the Array of cleaned tags.
  def self.get_tag_from_json(response)
    result = JSON.parse(response)
    result.delete("doc") # ditching the doc
    result.map do |_key, tag|
      cleaned = clean_unwanted_items_from_hash(tag)
      yield cleaned if block_given?
      cleaned
    end
  end

  # Placeholder, not implemented yet; always returns nil.
  def self.get_calais_metadata(response)
    #ce = CalaisExtractor.new( response )
    #ce.prettify
    #TODO work out how to implement this
  end

  # jkl doesn't work with these aspects of the calais response, also removing blanks
  #
  # h - Hash to clean; it is modified in place
  #
  # Removes the entries whose key is listed in UNWANTED_KEYS or whose value is
  # listed in BLANK_VALUES, in a single pass. Returns the same Hash.
  def self.clean_unwanted_items_from_hash(h)
    h.delete_if { |k, v| UNWANTED_KEYS.include?(k) || BLANK_VALUES.include?(v) }
  end

  # A bare `private` does not apply to methods defined with `def self.`, so
  # paramsXML has always been callable from outside the module; it stays that
  # way so existing callers keep working.

  # Builds the paramsXML document sent along with a Calais request.
  #
  # format - value for the c:outputFormat attribute, e.g. 'application/json'
  #
  # Returns the XML as a String.
  def self.paramsXML(format)
    <<-PARAMS_XML
    <c:params xmlns:c="http://s.opencalais.com/1/pred/"
              xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
      <c:processingDirectives
        c:contentType="text/txt"
        c:outputFormat="#{format}">
      </c:processingDirectives>
      <c:userDirectives />
      <c:externalMetadata />
    </c:params>
    PARAMS_XML
  end
end