muddyit_fu 0.2.10 → 0.2.11
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +86 -38
- data/VERSION +1 -1
- data/lib/muddyit/base.rb +16 -9
- data/lib/muddyit/collections/pages.rb +16 -8
- data/muddyit_fu.gemspec +1 -1
- data/test/test_muddyit_fu.rb +1 -1
- metadata +1 -1
data/README.rdoc
CHANGED
@@ -1,88 +1,136 @@
|
|
1
1
|
= muddyit_fu
|
2
2
|
|
3
|
+
Muddy is an information extraction platform. For further
|
4
|
+
details see the '{Getting Started with Muddy}[http://blog.muddy.it/2009/11/getting-started-with-muddy]'
|
5
|
+
article. This gem provides access to the Muddy platform via its API :
|
6
|
+
|
7
|
+
{Muddy Developer Guide}[http://muddy.it/developers/]
|
8
|
+
|
3
9
|
== Installation
|
4
10
|
|
5
11
|
sudo gem install gemcutter
|
6
12
|
sudo gem tumble
|
7
13
|
sudo gem install muddyit_fu
|
8
14
|
|
9
|
-
==
|
15
|
+
== Authentication and authorisation
|
16
|
+
|
17
|
+
Muddy supports OAuth and HTTP Basic auth for authentication and authorisation.
|
18
|
+
We recommend you use OAuth wherever possible when accessing Muddy. An example
|
19
|
+
of using OAuth with the muddy platform is described in the
|
20
|
+
{Building with Muddy and OAuth}[http://blog.muddy.it/2010/01/building-with-muddy-and-oauth]
|
21
|
+
article.
|
10
22
|
|
11
|
-
|
23
|
+
=== Example muddyit.yml for OAuth
|
12
24
|
|
13
|
-
|
14
|
-
|
25
|
+
---
|
26
|
+
consumer_key: YOUR_CONSUMER_KEY
|
27
|
+
consumer_secret: YOUR_CONSUMER_SECRET
|
28
|
+
access_token: YOUR_ACCESS_TOKEN
|
29
|
+
access_token_secret: YOUR_ACCESS_TOKEN_SECRET
|
15
30
|
|
16
|
-
|
31
|
+
=== Example muddyit.yml for HTTP Basic Auth
|
17
32
|
|
18
33
|
---
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
34
|
+
username: YOUR_USERNAME
|
35
|
+
password: YOUR_PASSWORD
|
36
|
+
|
37
|
+
== Simplest entity extraction example
|
23
38
|
|
24
|
-
|
39
|
+
This example uses the basic 'extract' method to retrieve a list of entities from
|
40
|
+
a piece of source text.
|
25
41
|
|
26
42
|
require 'muddyit_fu'
|
27
|
-
muddyit =
|
28
|
-
muddyit.
|
29
|
-
|
43
|
+
muddyit = Muddyit.new('./config.yml')
|
44
|
+
page = muddyit.extract(ARGV[0])
|
45
|
+
page.entities.each do |entity|
|
46
|
+
puts "\t#{entity.term}, #{entity.uri}, #{entity.classification}"
|
30
47
|
end
|
31
48
|
|
32
|
-
==
|
49
|
+
== Working with web pages instead of text
|
33
50
|
|
34
|
-
|
35
|
-
|
36
|
-
|
51
|
+
Muddy uses an intelligent extraction method to identify the key text on any given
|
52
|
+
web page, meaning that the entities extracted are relevant to the article and don't
|
53
|
+
include spurious results from navigation sidebars or page footers. To work with a
|
54
|
+
URL rather than text, just specify a URL instead :
|
37
55
|
|
38
|
-
|
56
|
+
page = muddyit.extract('http://news.bbc.co.uk/1/hi/northern_ireland/8450854.stm')
|
39
57
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
58
|
+
== Storing extraction results in a collection
|
59
|
+
|
60
|
+
Muddy allows you to store the entity extraction results so aggregate operations
|
61
|
+
can be performed over a collection of content (a 'collection' has many analysed 'pages').
|
62
|
+
A basic muddy account provides a single 'collection' where extraction results
|
63
|
+
can be stored.
|
64
|
+
|
65
|
+
To store a page against a collection, the collection must first be found :
|
66
|
+
|
67
|
+
collection = muddyit.collections.find(:all).first
|
44
68
|
|
45
|
-
|
69
|
+
Once a collection has been found, entity extraction results can be stored in it:
|
70
|
+
|
71
|
+
collection.pages.create('http://news.bbc.co.uk/1/hi/uk_politics/8011321.stm', {:minium_confidence => 0.2})
|
72
|
+
|
73
|
+
== Viewing all analysed pages in a collection
|
74
|
+
|
75
|
+
You can iterate through all the analysed pages in a collection. Be aware that
|
76
|
+
the Muddy API provides the pages as paginated sets, so it may take some time to
|
77
|
+
page through a complete set of pages in a collection (due to repeated HTTP requests
|
78
|
+
for each new paginated set of results).
|
46
79
|
|
47
80
|
require 'muddyit_fu'
|
48
|
-
muddyit = Muddyit.new(
|
49
|
-
|
50
|
-
:access_token => 'ccc',
|
51
|
-
:access_token_secret => 'ddd')
|
52
|
-
collection = muddyit.collections.first
|
81
|
+
muddyit = Muddyit.new('./config.yml')
|
82
|
+
collection = muddyit.collections.find(:all).first
|
53
83
|
collection.pages.find(:all) do |page|
|
54
84
|
puts page.title
|
55
85
|
page.entities.each do |entity|
|
56
|
-
puts entity.uri
|
86
|
+
puts "\t#{entity.uri}"
|
57
87
|
end
|
58
88
|
end
|
59
89
|
|
60
|
-
==
|
90
|
+
== Working with a collection
|
91
|
+
|
92
|
+
A collection allows aggregate operations to be performed on itself and on its
|
93
|
+
members. A collection is identified by its 'collection token'. This is an
|
94
|
+
alphanumeric six character string (e.g. 'a0ret4'). A collection can be found if
|
95
|
+
its token is known :
|
96
|
+
|
97
|
+
collection = muddyit.collections.find('a0ret4')
|
98
|
+
|
99
|
+
=== View all pages containing 'Gordon Brown'
|
100
|
+
|
101
|
+
If we want to find all references to the grounded entity for 'Gordon Brown' then
|
102
|
+
it can be searched for using its DBpedia URI :
|
61
103
|
|
62
104
|
require 'muddyit_fu'
|
63
|
-
muddyit = Muddyit.new('
|
64
|
-
collection = muddyit.collections.find(
|
105
|
+
muddyit = Muddyit.new('./config.yml')
|
106
|
+
collection = muddyit.collections.find('a0ret4')
|
65
107
|
collection.pages.find_by_entity('http://dbpedia.org/resource/Gordon_Brown') do |page|
|
66
108
|
puts page.identifier
|
67
109
|
end
|
68
110
|
|
69
|
-
|
111
|
+
=== Find related entities for 'Gordon Brown'
|
112
|
+
|
113
|
+
To find other entities that occur frequently with 'Gordon Brown' in this
|
114
|
+
collection :
|
70
115
|
|
71
116
|
require 'muddyit_fu'
|
72
|
-
muddyit = Muddyit.new('
|
73
|
-
collection = muddyit.
|
117
|
+
muddyit = Muddyit.new('./config.yml')
|
118
|
+
collection = muddyit.collections.find('a0ret4')
|
74
119
|
puts "Related entity\tOccurrence"
|
75
120
|
collection.entities.find_related('http://dbpedia.org/resource/Gordon_Brown').each do |entry|
|
76
121
|
puts "#{entry[:entity].uri}\t#{entry[:count]}"
|
77
122
|
end
|
78
123
|
|
79
|
-
|
124
|
+
=== Find related content for : http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm
|
125
|
+
|
126
|
+
To find other content in the collection that shares similar entities with the
|
127
|
+
analysed page that has a uri 'http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm' :
|
80
128
|
|
81
129
|
require 'muddyit_fu'
|
82
|
-
muddyit = Muddyit.new('
|
130
|
+
muddyit = Muddyit.new('./config.yml')
|
83
131
|
collection = muddyit.collections.find(:all).first
|
84
132
|
page = collection.pages.find(:all, :uri => 'http://news.bbc.co.uk/1/hi/uk_politics/7878418.stm').first
|
85
|
-
puts "
|
133
|
+
puts "Page : #{page.title}\n\n"
|
86
134
|
page.related_content.each do |results|
|
87
135
|
puts "#{results[:page].title} #{results[:count]}"
|
88
136
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.11
|
data/lib/muddyit/base.rb
CHANGED
@@ -125,22 +125,29 @@ module Muddyit
|
|
125
125
|
def collections() @collections ||= Muddyit::Collections.new(self) end
|
126
126
|
|
127
127
|
# A mirror of the pages.create method, but for one off, non-stored, quick extraction
|
128
|
-
def extract(doc
|
128
|
+
def extract(doc, options={})
|
129
129
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
130
|
+
document = {}
|
131
|
+
if doc.is_a? Hash
|
132
|
+
unless doc[:uri] || doc[:text]
|
133
|
+
raise
|
134
|
+
end
|
135
|
+
document = doc
|
136
|
+
elsif doc.is_a? String
|
137
|
+
if doc =~ /^http:\/\//
|
138
|
+
document[:uri] = doc
|
139
|
+
else
|
140
|
+
document[:text] = doc
|
141
|
+
end
|
136
142
|
end
|
137
143
|
|
138
|
-
|
144
|
+
# Ensure we get content_data as well
|
145
|
+
options[:include_content] = true
|
139
146
|
|
147
|
+
body = { :page => document.merge!(:options => options) }
|
140
148
|
api_url = "/extract"
|
141
149
|
response = self.send_request(api_url, :post, {}, body.to_json)
|
142
150
|
return Muddyit::Collections::Collection::Pages::Page.new(self, response)
|
143
|
-
|
144
151
|
end
|
145
152
|
|
146
153
|
protected
|
@@ -49,18 +49,26 @@ class Muddyit::Collections::Collection::Pages < Muddyit::Generic
|
|
49
49
|
# Params
|
50
50
|
# * options (Required)
|
51
51
|
#
|
52
|
-
def create(doc
|
52
|
+
def create(doc, options = {})
|
53
53
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
54
|
+
document = {}
|
55
|
+
if doc.is_a? Hash
|
56
|
+
unless doc[:uri] || doc[:text]
|
57
|
+
raise
|
58
|
+
end
|
59
|
+
document = doc
|
60
|
+
elsif doc.is_a? String
|
61
|
+
if doc =~ /^http:\/\//
|
62
|
+
document[:uri] = doc
|
63
|
+
else
|
64
|
+
document[:text] = doc
|
65
|
+
end
|
60
66
|
end
|
61
67
|
|
62
|
-
|
68
|
+
# Ensure we get content_data as well
|
69
|
+
options[:include_content] = true
|
63
70
|
|
71
|
+
body = { :page => document.merge!(:options => options) }
|
64
72
|
api_url = "/collections/#{self.collection.attributes[:token]}/pages/"
|
65
73
|
response = @muddyit.send_request(api_url, :post, {}, body.to_json)
|
66
74
|
return Muddyit::Collections::Collection::Pages::Page.new(@muddyit, response['page'].merge!(:collection => self.collection))
|
data/muddyit_fu.gemspec
CHANGED
data/test/test_muddyit_fu.rb
CHANGED