muddyit_fu 0.2.10 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +86 -38
- data/VERSION +1 -1
- data/lib/muddyit/base.rb +16 -9
- data/lib/muddyit/collections/pages.rb +16 -8
- data/muddyit_fu.gemspec +1 -1
- data/test/test_muddyit_fu.rb +1 -1
- metadata +1 -1
data/README.rdoc
CHANGED
@@ -1,88 +1,136 @@
|
|
1
1
|
= muddyit_fu
|
2
2
|
|
3
|
+
Muddy is an information extraction platform. For further
|
4
|
+
details see the '{Getting Started with Muddy}[http://blog.muddy.it/2009/11/getting-started-with-muddy]'
|
5
|
+
article. This gem provides access to the Muddy platform via its API :
|
6
|
+
|
7
|
+
{Muddy Developer Guide}[http://muddy.it/developers/]
|
8
|
+
|
3
9
|
== Installation
|
4
10
|
|
5
11
|
sudo gem install gemcutter
|
6
12
|
sudo gem tumble
|
7
13
|
sudo gem install muddyit_fu
|
8
14
|
|
9
|
-
==
|
15
|
+
== Authentication and authorisation
|
16
|
+
|
17
|
+
Muddy supports OAuth and HTTP Basic auth for authentication and authorisation.
|
18
|
+
We recommend you use OAuth wherever possible when accessing Muddy. An example
|
19
|
+
of using OAuth with the muddy platform is described in the
|
20
|
+
{Building with Muddy and OAuth}[http://blog.muddy.it/2010/01/building-with-muddy-and-oauth]
|
21
|
+
article.
|
10
22
|
|
11
|
-
|
23
|
+
=== Example muddyit.yml for OAuth
|
12
24
|
|
13
|
-
|
14
|
-
|
25
|
+
---
|
26
|
+
consumer_key: YOUR_CONSUMER_KEY
|
27
|
+
consumer_secret: YOUR_CONSUMER_SECRET
|
28
|
+
access_token: YOUR_ACCESS_TOKEN
|
29
|
+
access_token_secret: YOUR_ACCESS_TOKEN_SECRET
|
15
30
|
|
16
|
-
|
31
|
+
=== Example muddyit.yml for HTTP Basic Auth
|
17
32
|
|
18
33
|
---
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
34
|
+
username: YOUR_USERNAME
|
35
|
+
password: YOUR_PASSWORD
|
36
|
+
|
37
|
+
== Simplest entity extraction example
|
23
38
|
|
24
|
-
|
39
|
+
This example uses the basic 'extract' method to retrieve a list of entities from
|
40
|
+
a piece of source text.
|
25
41
|
|
26
42
|
require 'muddyit_fu'
|
27
|
-
muddyit =
|
28
|
-
muddyit.
|
29
|
-
|
43
|
+
muddyit = Muddyit.new('./config.yml')
|
44
|
+
page = muddyit.extract(ARGV[0])
|
45
|
+
page.entities.each do |entity|
|
46
|
+
puts "\t#{entity.term}, #{entity.uri}, #{entity.classification}"
|
30
47
|
end
|
31
48
|
|
32
|
-
==
|
49
|
+
== Working with web pages instead of text
|
33
50
|
|
34
|
-
|
35
|
-
|
36
|
-
|
51
|
+
Muddy uses an intelligent extraction method to identify the key text on any given
|
52
|
+
web page, meaning that the entities extracted are relevant to the article and don't
|
53
|
+
include spurious results from navigation sidebars or page footers. To work with a
|
54
|
+
URL rather than text, just specify a URL instead :
|
37
55
|
|
38
|
-
|
56
|
+
page = muddyit.extract('http://news.bbc.co.uk/1/hi/northern_ireland/8450854.stm')
|
39
57
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
58
|
+
== Storing extraction results in a collection
|
59
|
+
|
60
|
+
Muddy allows you to store the entity extraction results so aggregate operations
|
61
|
+
can be performed over a collection of content (a 'collection' has many analysed 'pages').
|
62
|
+
A basic muddy account provides a single 'collection' where extraction results
|
63
|
+
can be stored.
|
64
|
+
|
65
|
+
To store a page against a collection, the collection must first be found :
|
66
|
+
|
67
|
+
collection = muddyit.collections.find(:all).first
|
44
68
|
|
45
|
-
|
69
|
+
Once a collection has been found, entity extraction results can be stored in it:
|
70
|
+
|
71
|
+
collection.pages.create('http://news.bbc.co.uk/1/hi/uk_politics/8011321.stm', {:minium_confidence => 0.2})
|
72
|
+
|
73
|
+
== Viewing all analysed pages in a collection
|
74
|
+
|
75
|
+
You can iterate through all the analysed pages in a collection. Be aware that
|
76
|
+
the Muddy API provides the pages as paginated sets, so it may take some time to
|
77
|
+
page through a complete set of pages in a collection (due to repeated HTTP requests
|
78
|
+
for each new paginated set of results).
|
46
79
|
|
47
80
|
require 'muddyit_fu'
|
48
|
-
muddyit = Muddyit.new(
|
49
|
-
|
50
|
-
:access_token => 'ccc',
|
51
|
-
:access_token_secret => 'ddd')
|
52
|
-
collection = muddyit.collections.first
|
81
|
+
muddyit = Muddyit.new('./config.yml')
|
82
|
+
collection = muddyit.collections.find(:all).first
|
53
83
|
collection.pages.find(:all) do |page|
|
54
84
|
puts page.title
|
55
85
|
page.entities.each do |entity|
|
56
|
-
puts entity.uri
|
86
|
+
puts "\t#{entity.uri}"
|
57
87
|
end
|
58
88
|
end
|
59
89
|
|
60
|
-
==
|
90
|
+
== Working with a collection
|
91
|
+
|
92
|
+
A collection allows aggregate operations to be performed on itself and on its
|
93
|
+
members. A collection is identified by its 'collection token'. This is an
|
94
|
+
alphanumeric six character string (e.g. 'a0ret4'). A collection can be found if
|
95
|
+
its token is known :
|
96
|
+
|
97
|
+
collection = muddyit.collections.find('a0ret4')
|
98
|
+
|
99
|
+
=== View all pages containing 'Gordon Brown'
|
100
|
+
|
101
|
+
If we want to find all references to the grounded entity for 'Gordon Brown' then
|
102
|
+
it can be searched for using its DBpedia URI :
|
61
103
|
|
62
104
|
require 'muddyit_fu'
|
63
|
-
muddyit = Muddyit.new('
|
64
|
-
collection = muddyit.collections.find(
|
105
|
+
muddyit = Muddyit.new('./config.yml')
|
106
|
+
collection = muddyit.collections.find('a0ret4')
|
65
107
|
collection.pages.find_by_entity('http://dbpedia.org/resource/Gordon_Brown') do |page|
|
66
108
|
puts page.identifier
|
67
109
|
end
|
68
110
|
|
69
|
-
|
111
|
+
=== Find related entities for 'Gordon Brown'
|
112
|
+
|
113
|
+
To find other entities that occur frequently with 'Gordon Brown' in this
|
114
|
+
collection :
|
70
115
|
|
71
116
|
require 'muddyit_fu'
|
72
|
-
muddyit = Muddyit.new('
|
73
|
-
collection = muddyit.
|
117
|
+
muddyit = Muddyit.new('./config.yml')
|
118
|
+
collection = muddyit.collections.find('a0ret4')
|
74
119
|
puts "Related entity\tOccurance
|
75
120
|
collection.entities.find_related('http://dbpedia.org/resource/Gordon_Brown').each do |entry|
|
76
121
|
puts "#{entry[:enity].uri}\t#{entry[:count]}"
|
77
122
|
end
|
78
123
|
|
79
|
-
|
124
|
+
=== Find related content for : http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm
|
125
|
+
|
126
|
+
To find other content in the collection that shares similar entities with the
|
127
|
+
analysed page that has a uri 'http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm' :
|
80
128
|
|
81
129
|
require 'muddyit_fu'
|
82
|
-
muddyit = Muddyit.new('
|
130
|
+
muddyit = Muddyit.new('./config.yml')
|
83
131
|
collection = muddyit.collections.find(:all).first
|
84
132
|
page = collection.pages.find(:all, :uri => 'http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm').first
|
85
|
-
puts "
|
133
|
+
puts "Page : #{page.title}\n\n"
|
86
134
|
page.related_content.each do |results|
|
87
135
|
puts "#{results[:page].title} #{results[:count]}"
|
88
136
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.11
|
data/lib/muddyit/base.rb
CHANGED
@@ -125,22 +125,29 @@ module Muddyit
|
|
125
125
|
def collections() @collections ||= Muddyit::Collections.new(self) end
|
126
126
|
|
127
127
|
# A mirror of the pages.create method, but for one off, non-stored, quick extraction
|
128
|
-
def extract(doc
|
128
|
+
def extract(doc, options={})
|
129
129
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
130
|
+
document = {}
|
131
|
+
if doc.is_a? Hash
|
132
|
+
unless doc[:uri] || doc[:text]
|
133
|
+
raise
|
134
|
+
end
|
135
|
+
document = doc
|
136
|
+
elsif doc.is_a? String
|
137
|
+
if doc =~ /^http:\/\//
|
138
|
+
document[:uri] = doc
|
139
|
+
else
|
140
|
+
document[:text] = doc
|
141
|
+
end
|
136
142
|
end
|
137
143
|
|
138
|
-
|
144
|
+
# Ensure we get content_data as well
|
145
|
+
options[:include_content] = true
|
139
146
|
|
147
|
+
body = { :page => document.merge!(:options => options) }
|
140
148
|
api_url = "/extract"
|
141
149
|
response = self.send_request(api_url, :post, {}, body.to_json)
|
142
150
|
return Muddyit::Collections::Collection::Pages::Page.new(self, response)
|
143
|
-
|
144
151
|
end
|
145
152
|
|
146
153
|
protected
|
@@ -49,18 +49,26 @@ class Muddyit::Collections::Collection::Pages < Muddyit::Generic
|
|
49
49
|
# Params
|
50
50
|
# * options (Required)
|
51
51
|
#
|
52
|
-
def create(doc
|
52
|
+
def create(doc, options = {})
|
53
53
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
54
|
+
document = {}
|
55
|
+
if doc.is_a? Hash
|
56
|
+
unless doc[:uri] || doc[:text]
|
57
|
+
raise
|
58
|
+
end
|
59
|
+
document = doc
|
60
|
+
elsif doc.is_a? String
|
61
|
+
if doc =~ /^http:\/\//
|
62
|
+
document[:uri] = doc
|
63
|
+
else
|
64
|
+
document[:text] = doc
|
65
|
+
end
|
60
66
|
end
|
61
67
|
|
62
|
-
|
68
|
+
# Ensure we get content_data as well
|
69
|
+
options[:include_content] = true
|
63
70
|
|
71
|
+
body = { :page => document.merge!(:options => options) }
|
64
72
|
api_url = "/collections/#{self.collection.attributes[:token]}/pages/"
|
65
73
|
response = @muddyit.send_request(api_url, :post, {}, body.to_json)
|
66
74
|
return Muddyit::Collections::Collection::Pages::Page.new(@muddyit, response['page'].merge!(:collection => self.collection))
|
data/muddyit_fu.gemspec
CHANGED
data/test/test_muddyit_fu.rb
CHANGED