RubyGems - jakal - Versions diffs - 0.0.7 - Mend

jakal 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

data/License.txt +22 -0
data/README.rdoc +28 -0
data/features/calais.feature +38 -0
data/features/http.feature +32 -0
data/features/mocks/bbc_story.html +2863 -0
data/features/mocks/calais.json +2464 -0
data/features/mocks/topix_rss.xml +47 -0
data/features/mocks/twitter.json +11 -0
data/features/processing.feature +16 -0
data/features/sanitize-text.feature +53 -0
data/features/step_definitions/calais_steps.rb +44 -0
data/features/step_definitions/http_steps.rb +56 -0
data/features/step_definitions/processing_steps.rb +30 -0
data/features/step_definitions/require_steps.rb +12 -0
data/features/step_definitions/sanitize-text_steps.rb +32 -0
data/features/step_definitions/twitter_steps.rb +17 -0
data/features/support/env.rb +10 -0
data/lib/jkl.rb +48 -0
data/lib/jkl/calais_client.rb +64 -0
data/lib/jkl/rest_client.rb +36 -0
data/lib/jkl/rss_client.rb +15 -0
data/lib/jkl/url_doc_handler.rb +31 -0
metadata +78 -0

data/features/mocks/topix_rss.xml ADDED Viewed

@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<?xml-stylesheet href="/static/rss.3.xsl" type="text/xsl"?>
+<rss xmlns:topix="http://www.topix.com/partners/rsscomment/" xmlns:georss="http://www.georss.org/georss" version="2.0">
+	<channel>
+		<title>Search for "london" </title>
+		<link>http://www.topix.com/search/article?q=london&amp;x=0&amp;y=0</link>
+		<topix:rsslink>http://www.topix.com/rss/search/article.xml?q=london&amp;x=0&amp;y=0</topix:rsslink>
+		<description>News continually updated from thousands of sources across the web</description>
+		<language>en-us</language>
+		<ttl>240</ttl>
+		<copyright>Copyright 2008, Topix</copyright>
+		<image>
+			<title>Topix</title>
+			<link>http://www.topix.com/</link>
+			<url>http://topix.cachefly.net/pics/topix_homepage_logo2.png</url>
+		</image>
+		<item>
+			<title>Major Michael Jackson tribute planned for Vienna</title>
+			<link>http://www.localnews8.com/Global/story.asp?S=10876507</link>
+			<description><![CDATA[The King of Pop will get a royal send-off next month in Vienna. Events promoter World Awards Media GmbH confirmed Monday that members of Michael Jackson's family and a "high-profile lineup of international stars" are planning a tribute concert in the Austrian capital.]]></description>
+			<source>KIFI</source>
+			<pubDate>Mon, 10 Aug 2009 15:21:58 GMT</pubDate>
+			<category>Jermaine Jackson</category>
+			<category>Michael Jackson</category>
+			<category>Pop/Rock</category>
+			<category>Black Entertainment</category>
+			<category>R-N-B</category>
+			<guid isPermaLink="false">C39RO2C8Q8NQR825</guid>
+		</item>
+		<item>
+			<title>LATEST: Man stabbed to death with machete in pub garden</title>
+			<link>http://www.thisislondon.co.uk/standard/article-23730262-details/LATEST%3A+Man+stabbed+to+death+with+machete+in+pub+garden/article.do</link>
+			<description><![CDATA[A man drinking with his girlfriend in a beer garden was hacked to death with machetes in front of horrified drinkers.]]></description>
+			<source>This is London</source>
+			<pubDate>Mon, 10 Aug 2009 15:21:51 GMT</pubDate>
+			<category>Greater London County, England</category>
+			<category>Cheshire County, England</category>
+			<category>England, United Kingdom</category>
+			<category>United Kingdom</category>
+			<category>Sutton, England</category>
+			<category>London, England</category>
+			<category>World News</category>
+			<category>Essex County, England</category>
+			<guid isPermaLink="false">KN03NCSH3KIVG416</guid>
+		</item>
+	</channel>
+</rss>

data/features/mocks/twitter.json ADDED Viewed

@@ -0,0 +1,11 @@
+{
+    "trends": [{
+        "name": "musicmonday",
+        "url": "http:\/\/search.twitter.com\/search?q=%23musicmonday"
+    },
+    {
+        "name": "GI Joe",
+        "url": "http:\/\/search.twitter.com\/search?q=%22GI+Joe%22+OR+Joe"
+    }],
+    "as_of": "Mon, 10 Aug 2009 15:04:54 +0000"
+}

data/features/processing.feature ADDED Viewed

@@ -0,0 +1,16 @@
+Feature: Processing features
+  In order to integrate our apps
+  As a developer
+  I want to make some requests and inspect some responses
+  @connection_needed
+  Scenario: end to end flow, keyphrase to tags
+	Given I have a keyphrase 'london restaurants'
+	When I request some RSS
+	Then I should receive some headlines
+	And I should be able to get the copy from the first headline
+	When I post to calais
+	Then I should receive some tags
+	And I should be able to persist these tags
+	When I generate a view of the recent keyword results
+	Then I should see a network graph

data/features/sanitize-text.feature ADDED Viewed

@@ -0,0 +1,53 @@
+Feature: Sanitizing text
+  In order to integrate our apps
+  As a developer
+  I want to make some requests and inspect some responses
+	@unit @text
+	Scenario: Sanitize some ok text
+		Given I have a keyphrase 'the cat sat on the mat'
+		When I sanitize this text
+		Then it should be ok
+		And it should say 'the cat sat on the mat'
+	@unit @text
+	Scenario: Sanitize some short text
+		Given I have a keyphrase 'the cat sat'
+		When I sanitize this text
+		Then it should say ''
+	@unit @text @wip
+		Scenario: Sanitize some text with tabs and spaces
+		Given I have a keyphrase 'the cat sat on 						the mat            '
+		When I sanitize this text
+		Then it should say 'the cat sat on the mat'
+	@unit @text @wip
+		Scenario: Sanitize some short text with tabs and spaces
+		Given I have a keyphrase 'the   cat sat on 						           '
+		When I sanitize this text
+		Then it should say ''
+	@unit @text
+	Scenario: Sanitize some tagged short text
+		Given I have a keyphrase '<a href="a-link.html>the cat sat</a>'
+		When I sanitize this text
+		Then it should say ''
+	@unit @text
+	Scenario: Sanitize some tagged text
+		Given I have a keyphrase '<a href="a-link.html>the cat sat on the mat</a>'
+		When I sanitize this text
+		Then it should be ok
+		Then it should say 'the cat sat on the mat'
+	@unit @text @wip
+	Scenario: Remove script tags
+	  Given I have some script tag data
+	  When I sanitize this text
+	  Then it should say ' some para stuff here '
+	Scenario: Clean a web page
+		Given I have a sample BBC story
+		When I sanitize this text
+		Then it should be ok

data/features/step_definitions/calais_steps.rb ADDED Viewed

@@ -0,0 +1,44 @@
+Given /^I have some simple text$/ do
+  @text = "Barack Obama said today that he expects there to be conflict within his new security team after confirming Hillary Clinton as his choice for US Secretary of State."
+end
+Given /^I have a sanitized sample BBC story$/ do
+  Given "I have a sample BBC story"
+	When "I sanitize this text"
+end
+Given /^I have a mock calais response$/ do
+  @response = File.open('features/mocks/calais.json','r') {|f| f.readlines.to_s}
+end
+When /^I post to calais$/ do
+  @response = Jkl::get_from_calais @text
+end
+When /^I remove the unwanted items$/ do
+  @processed_json = Jkl::clean_unwanted_items_from_hash(JSON.parse(@response))
+end
+Then /^there should no longer be any "([^\"]*)"$/ do |arg1|
+  @processed_json[arg1].should be_nil
+end
+Then /^I should receive some tags$/ do
+  Jkl::get_tag_from_json(@response) do |tag|
+    tag.should_not be_nil
+  end
+end
+Then /^there should be some "([^\"]*)" tags$/ do |arg1|
+  Jkl::get_tag_from_json(@response) {|tag|
+    #puts tag.inspect
+    tag.each{|k,v| puts "#{k} : #{v}" if k=='_type'}
+  }
+end
+Then /^I should be able to see the whole lot of tags as one block$/ do
+  tags = Jkl::get_tag_from_json(@response)
+  tags.length.should > 0
+end

data/features/step_definitions/http_steps.rb ADDED Viewed

@@ -0,0 +1,56 @@
+When /^I post some data to yahoo$/ do
+  @url = URI.parse('http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction')
+  appid = LICENSE_ID = YAML::load_file('config/keys.yml')['yahoo']
+  context = URI.encode('Italian sculptors and painters of the renaissance favored the Virgin Mary for inspiration')
+  post_args = { 'appid' => appid, 'context' => context, 'output' => 'json' }
+  @response = Jkl::post_to @url, post_args
+end
+When /^I request some RSS$/ do
+  keyphrase = @keyphrase || "iraq"
+  url = "#{YAML::load_file('config/config.yml')['topix']}#{CGI::escape(keyphrase)}"
+  @response = Jkl::get_from_as_xml url
+end
+Given /^I have some RSS$/ do
+  raw = File.open('features/mocks/topix_rss.xml','r') {|f| f.readlines.to_s}
+  @response = Hpricot.XML raw
+end
+When /^I make a restful get request$/ do
+  url = "http://news.bbc.co.uk/1/hi/uk_politics/7677419.stm"
+  @response = Jkl::get_from url
+end
+When /^I request some trends$/ do
+  twitter_json_url = YAML::load_file('config/config.yml')['twitter']
+  output = JSON.parse Jkl::get_from twitter_json_url
+  @response = output['trends']
+end
+Then /^I should get a response$/ do
+  @response.should_not == nil
+  #puts @response.inspect
+end
+Then /^I should receive some headlines$/ do
+  @items = Jkl::get_items_from @response
+  @links = []
+  @items.each do |item|
+    @links << Jkl::attribute_from(item, :link)
+  end
+  @links.should_not == nil
+  @links.length.should > 0
+end
+Then /^I should be able to get the copy from the first headline$/ do
+  @response = Jkl::get_from @links[0]
+  @response.should_not be_nil
+  @response.should_not == ""
+  @text = Jkl::sanitize @response
+end
+Then /^I should see some text$/ do
+  @response.length.should > 0
+end

data/features/step_definitions/processing_steps.rb ADDED Viewed

@@ -0,0 +1,30 @@
+############### pending steps below ################
+When /^I request tags for the first story$/ do
+  r = Jkl::get_from_calais @story
+  Jkl::get_tag_from_json(get_from_calais(@story)) do |tag|
+    tag.each{|k,v| puts "#{k} : #{v}"}
+  end
+end
+When /^I request stories from Topix$/ do
+  search_term = 'london'
+  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
+  @response = Jkl::get_from_as_xml url
+end
+When /^I get some news stories from the first keyword$/ do
+  search_term = @trend['name'].gsub('#','') #removing hash from start of trend name
+  search_term = 'london'
+  url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
+  rss_response = Jkl::get_from_as_xml url
+  items = Jkl::get_items_from rss_response
+  links = []
+  items.each do |item|
+    links << attribute_from(item, :link)
+  end
+  @story = Jkl::sanitize Jkl::from_doc Jkl::get_from links[0]
+end

data/features/step_definitions/require_steps.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require 'hpricot'
+require 'json'
+require 'restclient'
+require 'haml'
+require 'cgi'
+require 'lib/jkl.rb'
+require 'lib/jkl/calais_client.rb'
+require 'lib/jkl/rest_client.rb'
+require 'lib/jkl/rss_client.rb'
+require 'lib/jkl/url_doc_handler.rb'
+include Jkl

data/features/step_definitions/sanitize-text_steps.rb ADDED Viewed

@@ -0,0 +1,32 @@
+Given "I have a keyphrase '$text'" do |text|
+  @text = text
+end
+Given /^I have a sample BBC story$/ do
+  @text = File.open('features/mocks/bbc_story.html','r') {|f| f.readlines.to_s}
+end
+When /^I sanitize this text$/ do
+  @text = Jkl::sanitize @text
+end
+Then /^it should be ok$/ do
+  @text.should_not be_nil
+  @text.should_not == ""
+end
+Then "it should say '$text'" do |text|
+  @text.should == text
+end
+Given /^I have some script tag data$/ do
+  @text = <<-EOF;
+  some start stuff here
+  <script type="text/javascript" charset="utf-8">
+   function nofunction(){var bob;}
+  </script>
+  <p> some para stuff here </p>
+  some end stuff here
+    EOF
+end

data/features/step_definitions/twitter_steps.rb ADDED Viewed

@@ -0,0 +1,17 @@
+Given /^I have a mock twitter response$/ do
+  @response = File.open('features/mocks/twitter.json','r') {|f| f.readlines.to_s}
+end
+When /^I request trends data from twitter$/ do
+  @url = YAML::load_file('config/config.yml')['twitter']
+  @response = Jkl::get_from @url
+end
+Then /^I should see some trends$/ do
+  result = JSON.parse @response
+  trends = result['trends']
+  trends.each do |subject|
+    subject['name'].length.should > 1
+    subject['url'].length.should > 1
+  end
+end

data/features/support/env.rb ADDED Viewed

@@ -0,0 +1,10 @@
+gem 'rack-test'
+require 'spec/expectations'
+require 'rack/test'
+class MyWorld
+  include Rack::Test::Methods
+end
+World{MyWorld.new}

data/lib/jkl.rb ADDED Viewed

@@ -0,0 +1,48 @@
+require "cgi"
+require "jkl/rest_client.rb"
+require "jkl/rss_client.rb"
+require "jkl/calais_client.rb"
+require "jkl/url_doc_handler.rb"
+module Jkl
+  def self.headlines(feed, keyphrase)
+    get_from_as_xml "#{feed}#{keyphrase}"
+  end
+  def self.pages(headlines)
+    items = get_items_from headlines
+    descriptions = ""
+    items.each do |item|
+      descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
+    end
+    descriptions
+  end
+  def self.descriptions(headlines)
+    items = get_items_from headlines
+    descriptions = []
+    items.each do |item|
+      descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
+    end
+    descriptions
+  end
+  def self.links(headlines)
+    items = get_items_from headlines
+    links = []
+    items.each do |item|
+      links << attribute_from(item, :link)
+    end
+    links
+  end
+  def self.tags(key, pages)
+    Calais.process_document(
+        :content => pages,
+        :content_type => :text,
+        :license_id => key
+    )
+  end
+end

data/lib/jkl/calais_client.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require "json"
+require "rest_client"
+require "calais"
+module Jkl
+  def self.get_from_calais(content)
+    begin
+      license_id = YAML::load_file('config/keys.yml')['calais']
+      c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
+      post_args = { 'licenseID' => license_id, 'content' => content,
+                  'paramsXML' => paramsXML('application/json') }
+      post_to(c_uri, post_args)
+    rescue Exception => e
+      puts e
+    end
+  end
+  def self.get_tag_from_json(response)
+    result = JSON.parse response
+    result.delete_if {|key, value| key == "doc" } # ditching the doc
+    cleaned_result = []
+    result.each do |key,tag|
+      tag = Jkl::clean_unwanted_items_from_hash tag
+      cleaned_result << tag
+      yield tag if block_given?
+    end
+    cleaned_result
+  end
+  def self.get_calais_metadata(response)
+     #ce = CalaisExtractor.new( response )
+     #ce.prettify
+     #TODO work out how to implement this
+  end
+  #jkl doesn't work with these aspects of the calais response, also removing blanks
+  def self.clean_unwanted_items_from_hash h
+    h.delete_if {|k, v| k == "relevance" }
+    h.delete_if {|k, v| k == "instances" }
+    h.delete_if {|k, v| v == "N/A"}
+    h.delete_if {|k, v| v == []}
+    h.delete_if {|k, v| v == ""}
+    h.delete_if {|k, v| k == "_typeGroup"}
+    h
+  end
+  private
+  def self.paramsXML(format)
+   <<-paramsXML;
+    <c:params xmlns:c="http://s.opencalais.com/1/pred/"
+           xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+           <c:processingDirectives
+           c:contentType="text/txt"
+           c:outputFormat="#{format}">
+           </c:processingDirectives>
+           <c:userDirectives />
+           <c:externalMetadata />
+           </c:params>
+    paramsXML
+  end
+end