RubyGems - muddyit_fu - Versions diffs - 0.2.10 → 0.2.11 - Mend

muddyit_fu 0.2.10 → 0.2.11

Files changed (7) hide show

data/README.rdoc +86 -38
data/VERSION +1 -1
data/lib/muddyit/base.rb +16 -9
data/lib/muddyit/collections/pages.rb +16 -8
data/muddyit_fu.gemspec +1 -1
data/test/test_muddyit_fu.rb +1 -1
metadata +1 -1

data/README.rdoc CHANGED Viewed

@@ -1,88 +1,136 @@
 = muddyit_fu
+Muddy is an information extraction platform.  For further
+details see the '{Getting Started with Muddy}[http://blog.muddy.it/2009/11/getting-started-with-muddy]'
+article.  This gem provides access to the Muddy platform via its API :
+{Muddy Developer Guide}[http://muddy.it/developers/]
 == Installation
   sudo gem install gemcutter
   sudo gem tumble
   sudo gem install muddyit_fu
-== Getting started
+== Authentication and authorisation
+Muddy supports OAuth and HTTP Basic auth for authentication and authorisation.
+We recommend you use OAuth wherever possible when accessing Muddy.  An example
+of using OAuth with the muddy platform is described in the
+{Building with Muddy and OAuth}[http://blog.muddy.it/2010/01/building-with-muddy-and-oauth]
+article.
-muddy.it uses oauth to manage it's api access.
+=== Example muddyit.yml for OAuth
-See http://blog.muddy.it/2009/11/getting-started-with-muddy for details on how to
-use the API.
+  ---
+  consumer_key: YOUR_CONSUMER_KEY
+  consumer_secret: YOUR_CONSUMER_SECRET
+  access_token: YOUR_ACCESS_TOKEN
+  access_token_secret: YOUR_ACCESS_TOKEN_SECRET
-== Example muddyit.yml
+=== Example muddyit.yml for HTTP Basic Auth
   ---
-  consumer_key: "YOUR_CONSUMER_KEY"
-  consumer_secret: "YOUR_CONSUMER_SECRET"
-  access_token: "YOUR_ACCESS_TOKEN"
-  access_token_secret: "YOUR_ACCESS_TOKEN_SECRET"
+  username: YOUR_USERNAME
+  password: YOUR_PASSWORD
+== Simplest entity extraction example
-== Retrieving all collections
+This example uses the basic 'extract' method to retrieve a list of entities from
+a piece of source text.
   require 'muddyit_fu'
-  muddyit = Muddyit.new('muddyit.yml')
-  muddyit.collections.find(:all).each do |collection|
-    puts "#{collection.label} : #{collection.token}"
+  muddyit =  Muddyit.new('./config.yml')
+  page = muddyit.extract(ARGV[0])
+  page.entities.each do |entity|
+    puts "\t#{entity.term}, #{entity.uri}, #{entity.classification}"
   end
-== Retrieving a single collection
+== Working with web pages instead of text
-  require 'muddyit_fu'
-  muddyit = Muddyit.new('muddyit.yml')
-  puts muddyit.collections.find('a0ret4').label
+Muddy uses an intelligent extraction method to identify the key text on any given
+web page, meaning that the entities extracted are relevant to the article and don't
+include spurious results from navigation sidebars or page footers.  To work with a
+URL rather than text, just specify a URL instead :
-== Categorisation request
+  page = muddyit.extract('http://news.bbc.co.uk/1/hi/northern_ireland/8450854.stm')
-  require 'muddyit_fu'
-  muddyit = Muddyit.new('muddyit.yml')
-  collection = muddyit.collections.first
-  collection.pages.create({:uri => 'http://news.bbc.co.uk/1/hi/uk_politics/8011321.stm'}, {:minium_confidence => 0.2})
+== Storing extraction results in a collection
+Muddy allows you to store the entity extraction results so aggregate operations
+can be performed over a collection of content (a 'collection' has many analysed 'pages').
+A basic muddy account provides a single 'collection' where extraction results
+can be stored.
+To store a page against a collection, the collection must first be found :
+  collection = muddyit.collections.find(:all).first
-== View categorised pages
+Once a collection has been found, entity extraction results can be stored in it:
+  collection.pages.create('http://news.bbc.co.uk/1/hi/uk_politics/8011321.stm', {:minium_confidence => 0.2})
+== Viewing all analysed pages in a collection
+You can iterate through all the analysed pages in a collection.  Be aware that
+the Muddy API provides the pages as paginated sets, so it may take some time to
+page through a complete set of pages in a collection (due to repeated HTTP requests
+for each new paginated set of results).
   require 'muddyit_fu'
-  muddyit =  Muddyit.new(:consumer_key => 'aaa',
-                         :consumer_secret => 'bbb',
-                         :access_token => 'ccc',
-                         :access_token_secret => 'ddd')
-  collection = muddyit.collections.first
+  muddyit =  Muddyit.new('./config.yml')
+  collection = muddyit.collections.find(:all).first
   collection.pages.find(:all) do |page|
     puts page.title
     page.entities.each do |entity|
-      puts entity.uri
+      puts "\t#{entity.uri}"
     end
   end
-== View all pages containing 'Gordon Brown'
+== Working with a collection
+A collection allows aggregate operations to be performed on itself and on its
+members.  A collection is identified by its 'collection token'.  This is an
+alphanumeric six character string (e.g. 'a0ret4').  A collection can be found if
+its token is known :
+  collection = muddyit.collections.find('a0ret4')
+=== View all pages containing 'Gordon Brown'
+If we want to find all references to the grounded entity for 'Gordon Brown' then
+it can be searched for using its DBpedia URI :
   require 'muddyit_fu'
-  muddyit = Muddyit.new('muddyit.yml')
-  collection = muddyit.collections.find(:all).first
+  muddyit = Muddyit.new('./config.yml')
+  collection = muddyit.collections.find('a0ret4')
   collection.pages.find_by_entity('http://dbpedia.org/resource/Gordon_Brown') do |page|
     puts page.identifier
   end
-== Find related entities for 'Gordon Brown'
+=== Find related entities for 'Gordon Brown'
+To find other entities that occur frequently with 'Gordon Brown' in this
+collection :
   require 'muddyit_fu'
-  muddyit = Muddyit.new('muddyit.yml')
-  collection = muddyit.collcetions.find(:all).first
+  muddyit = Muddyit.new('./config.yml')
+  collection = muddyit.collections.find('a0ret4')
   puts "Related entity\tOccurance
   collection.entities.find_related('http://dbpedia.org/resource/Gordon_Brown').each do |entry|
     puts "#{entry[:enity].uri}\t#{entry[:count]}"
   end
-== Find related content for : http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm
+=== Find related content for : http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm
+To find other content in the collection that shares similar entities with the
+analysed page that has a uri 'http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm' :
   require 'muddyit_fu'
-  muddyit = Muddyit.new('muddyit.yml')
+  muddyit = Muddyit.new('./config.yml')
   collection = muddyit.collections.find(:all).first
   page = collection.pages.find(:all, :uri => 'http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm').first
-  puts "Our page : #{page.title}\n\n"
+  puts "Page : #{page.title}\n\n"
   page.related_content.each do |results|
     puts "#{results[:page].title} #{results[:count]}"
   end

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.10
1	+ 0.2.11

data/lib/muddyit/base.rb CHANGED Viewed

@@ -125,22 +125,29 @@ module Muddyit
     def collections() @collections ||= Muddyit::Collections.new(self) end
     # A mirror of the pages.create method, but for one off, non-stored, quick extraction
-    def extract(doc={}, options={})
+    def extract(doc, options={})
-      # Ensure we get content_data as well
-      options[:include_content] = true
-      # Ensure we have encoded the identifier and URI
-      unless doc[:uri] || doc[:text]
-        raise
+      document = {}
+      if doc.is_a? Hash
+        unless doc[:uri] || doc[:text]
+          raise
+        end
+        document = doc
+      elsif doc.is_a? String
+        if doc =~ /^http:\/\//
+          document[:uri] = doc
+        else
+          document[:text] = doc
+        end
       end
-      body = { :page => doc.merge!(:options => options) }
+      # Ensure we get content_data as well
+      options[:include_content] = true
+      body = { :page => document.merge!(:options => options) }
       api_url = "/extract"
       response = self.send_request(api_url, :post, {}, body.to_json)
       return Muddyit::Collections::Collection::Pages::Page.new(self, response)
     end
     protected

data/lib/muddyit/collections/pages.rb CHANGED Viewed

@@ -49,18 +49,26 @@ class Muddyit::Collections::Collection::Pages < Muddyit::Generic
   # Params
   # * options (Required)
   #
-  def create(doc = {}, options = {})
+  def create(doc, options = {})
-    # Ensure we get content_data as well
-    options[:include_content] = true
-    # Ensure we have encoded the identifier and URI
-    unless doc[:uri] || doc[:text]
-      raise
+    document = {}
+    if doc.is_a? Hash
+      unless doc[:uri] || doc[:text]
+        raise
+      end
+      document = doc
+    elsif doc.is_a? String
+      if doc =~ /^http:\/\//
+        document[:uri] = doc
+      else
+        document[:text] = doc
+      end
     end
-    body = { :page => doc.merge!(:options => options) }
+    # Ensure we get content_data as well
+    options[:include_content] = true
+    body = { :page => document.merge!(:options => options) }
     api_url = "/collections/#{self.collection.attributes[:token]}/pages/"
     response = @muddyit.send_request(api_url, :post, {}, body.to_json)
     return Muddyit::Collections::Collection::Pages::Page.new(@muddyit, response['page'].merge!(:collection => self.collection))

data/muddyit_fu.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name = %q{muddyit_fu}
-  s.version = "0.2.10"
+  s.version = "0.2.11"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["rattle"]

data/test/test_muddyit_fu.rb CHANGED Viewed

@@ -24,7 +24,7 @@ class TestMuddyitFu < Test::Unit::TestCase
     end
     should "analyse a page without a collection" do
-      page = @muddyit.extract({:uri => @@STORY})
+      page = @muddyit.extract(@@STORY)
       assert page.entities.length > 0
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: muddyit_fu
 version: !ruby/object:Gem::Version
-  version: 0.2.10
+  version: 0.2.11
 platform: ruby
 authors:
 - rattle