activity_mapper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +211 -0
- data/lib/activity_mapper.rb +35 -0
- data/lib/activity_mapper/activity_data_mapper.rb +93 -0
- data/lib/activity_mapper/connector.rb +71 -0
- data/lib/activity_mapper/lexicon.txt +92662 -0
- data/lib/activity_mapper/linguistics.rb +147 -0
- data/lib/activity_mapper/service_module.rb +194 -0
- data/lib/activity_mapper/service_modules.rb +4 -0
- data/lib/activity_mapper/service_modules/delicious.rb +41 -0
- data/lib/activity_mapper/service_modules/flickr.rb +59 -0
- data/lib/activity_mapper/service_modules/twitter.rb +57 -0
- data/lib/activity_mapper/service_modules/wakoopa.rb +350 -0
- data/lib/activity_mapper/service_modules/youtube.rb +63 -0
- data/lib/extensions/uri.rb +42 -0
- data/spec/connector_mock.rb +49 -0
- data/spec/data/delicious_ac8fdf9b4e304b150bf198b42a1cb1b4.json +1 -0
- data/spec/data/delicious_ee55656e0f69242ccf02c3eb0f97b296.json +1 -0
- data/spec/data/flickr_2b826afe2906894197d92f7a41c2785c. +1 -0
- data/spec/data/flickr_3c25cf51d174ee4bb2d8673e294ce4c0.json +196 -0
- data/spec/data/flickr_482967b550afd05993ec4256fa1de388.json +229 -0
- data/spec/data/flickr_54c4b36bea4e2b14c783e5c50cba8544.json +9 -0
- data/spec/data/flickr_848ea91c1a903d0347d1029bf863132a. +1 -0
- data/spec/data/flickr_bcb82142f3d6998cabab3c82fba15ced.json +141 -0
- data/spec/data/flickr_e9d78058938c7c845a8de3a2c3526c61. +1 -0
- data/spec/data/twitter_227e1ca72a2ba08a1b03be2cd64b681e.json +274 -0
- data/spec/data/twitter_5dd0884137bf193d55d446619ec65c7e.json +1 -0
- data/spec/data/twitter_dd694fe9ebda58a84f92a859fb1cb79c.json +1 -0
- data/spec/data/wakoopa_1a8b94baf4eed4fd0e7ac39dc2f53059.json +1 -0
- data/spec/data/youtube_050e71a038359cc8ff7e00161e687b4e.json +1 -0
- data/spec/data/youtube_6d895fb0245f47b9e4fcc868d1f91674.json +1 -0
- data/spec/data/youtube_a92042581e44f9d3da548e2c7a89849c.json +1 -0
- data/spec/data/youtube_bf7b648ff8720edb1db951b7b9a474c4.json +1 -0
- data/spec/data/zemanta_suggest_for_tweet_response.xml +405 -0
- data/spec/models.rb +114 -0
- data/spec/service_modules/delicious_spec.rb +32 -0
- data/spec/service_modules/flickr_spec.rb +43 -0
- data/spec/service_modules/twitter_spec.rb +44 -0
- data/spec/service_modules/wakoopa_spec.rb +35 -0
- data/spec/service_modules/youtube_spec.rb +47 -0
- data/spec/spec_helper.rb +8 -0
- metadata +130 -0
@@ -0,0 +1,147 @@
|
|
1
|
+
# tagger.rb - a Ruby Part of Speech Tagger patterned on the work of Eric Brill. Version 0.1
|
2
|
+
#
|
3
|
+
# Copyright 2005 Mark Watson. All rights reserved.
|
4
|
+
# This software is released under the LGPL
|
5
|
+
#
|
6
|
+
# Contributor: Pat Eyler
|
7
|
+
#
|
8
|
+
|
9
|
+
module ActivityMapper

  module Linguistics

    # Rule-based part-of-speech tagger patterned on Eric Brill's tagger.
    #
    # Each word is first given the first tag listed for it in the lexicon
    # (falling back to the lowercased word, then to 'NN'), after which a fixed
    # sequence of transformational rules adjusts the tags.
    class Tagger

      # Tags whose words are never reported as keywords.
      UNINTERESTING_PARTS_OF_SPEECH = ['DT', 'PRP', 'IN', 'CC', 'MD']

      # Tag given to a word that has no usable lexicon entry.
      DEFAULT_TAG = 'NN'

      # Loads lexicon.txt from the directory of this source file. Every
      # non-blank line is split on whitespace: the first token is the word,
      # the remaining tokens are its candidate tags.
      #
      # File.foreach closes the file even when reading raises.
      def initialize
        @lexicon = {}
        path = File.join(File.dirname(__FILE__), 'lexicon.txt')
        File.foreach(path) do |line|
          word, *tags = line.split
          next if word.nil? # blank line: nothing to index
          @lexicon[word] = tags
        end
      end

      # Splits a string on spaces, commas, periods, colons, semicolons and
      # apostrophes. Adjacent delimiters produce empty-string tokens, which
      # are kept so that token positions stay stable for callers.
      #
      # @param words [String]
      # @return [Array<String>]
      def tokenize(words)
        words.split(/ |,|\.|\:|\;|\'/)
      end

      # Extracts lowercased keywords from a caption: tokens longer than four
      # characters whose tag is not in UNINTERESTING_PARTS_OF_SPEECH.
      #
      # This is best-effort. If tagging fails, the raw tokens are returned
      # unfiltered; if even tokenizing fails, an empty array is returned, so
      # callers can always concatenate the result.
      #
      # @param caption [String]
      # @return [Array<String>]
      def self.keywords_for_caption(caption)
        tokens = nil
        tagger = shared_tagger
        tokens = tagger.tokenize(caption)
        tags = tagger.part_of_speech_tag(tokens)

        keywords = []
        tokens.each_with_index do |token, i|
          next if UNINTERESTING_PARTS_OF_SPEECH.include?(tags[i])
          next unless token.size > 4
          keywords << token.downcase
        end
        keywords
      rescue StandardError
        tokens || []
      end

      # Lazily built tagger reused by keywords_for_caption, so the lexicon is
      # only read once per process.
      def self.shared_tagger
        @tagger ||= new
      end
      private_class_method :shared_tagger

      # Tags every word of +text+.
      #
      # @param text [String, Array<String>] a string (tokenized first) or an
      #   array of words
      # @return [Array<String>] one tag per word, in order
      # @raise [RuntimeError] when +text+ is neither a String nor an Array
      def part_of_speech_tag(text)
        ## start by tokenizing strings passed in
        text = tokenize(text) if text.is_a?(String)

        ## we only work on arrays. If text isn't an array, quit now.
        unless text.is_a?(Array)
          raise RuntimeError, "can't tokenize #{text.class}"
        end

        ret = text.map { |word| initial_tag(word) }

        ## Now, apply transformational rules, in order, to every word:
        text.each_with_index do |word, i|

          ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
          if i > 0 && ret[i - 1] == "DT" && ["VBD", "VBP", "VB"].include?(ret[i])
            ret[i] = "NN"
          end

          ## rule 2: convert a noun to a number (CD) if "." appears in the word
          ret[i] = "CD" if ret[i] =~ /^N/ && word =~ /\./

          ## rule 3: convert a noun to a past participle if the word ends
          ## with "ed"
          ret[i] = "VBN" if ret[i] =~ /^N/ && word =~ /ed$/

          ## rule 4: convert any type to adverb if it ends in "ly"
          ret[i] = "RB" if word =~ /ly$/

          ## rule 5: convert a common noun (NN or NNS) to an adjective if
          ## it ends with "al"
          ret[i] = "JJ" if ret[i] =~ /^NN/ && word =~ /al$/

          ## rule 6: convert a noun to a verb if the preceding word is "would"
          if i > 0 && ret[i] =~ /^NN/ && text[i - 1].downcase == "would"
            ret[i] = "VB"
          end

          ## rule 7: if a word has been categorized as a common noun and
          ## it ends with "s", then set its type to plural common noun (NNS)
          ret[i] = "NNS" if ret[i] == "NN" && word =~ /s$/

          ## rule 8: convert a common noun to a present participle
          ## verb (i.e., a gerund)
          ret[i] = "VBG" if ret[i] =~ /^NN/ && word =~ /ing$/

          ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2>
          ## can also be a verb. Words without a lexicon entry have no
          ## alternative tags, so they are left unchanged.
          if i > 0 && ret[i - 1] =~ /^NN/ && ret[i] =~ /^NN/
            candidates = @lexicon[word] || @lexicon[word.downcase] || []
            ret[i] = "VBN" if candidates.include?("VBN")
            ret[i] = "VBZ" if candidates.include?("VBZ")
          end
        end

        ret
      end

      private

      # First lexicon tag for +word+, trying the exact spelling, then the
      # lowercased spelling, then DEFAULT_TAG.
      def initial_tag(word)
        exact = @lexicon[word]
        folded = @lexicon[word.downcase]
        (exact && exact[0]) || (folded && folded[0]) || DEFAULT_TAG
      end

    end # class Tagger

  end

end
|
@@ -0,0 +1,194 @@
|
|
1
|
+
|
2
|
+
module ActivityMapper

  # Base class for service-specific importers. A concrete service module
  # subclasses this, defines an ACCEPTED_HOSTS array of host regexps and
  # implements create_or_update_summary! and aggregate_activity!.
  class ServiceModule

    # Path segments that commonly precede a username in a profile URL.
    COMMON_DIRECTIVES = ['user', 'users', 'person', 'people', 'traveller', 'in', 'profile', 'profiles']

    # @param profile the profile record this module works on; the visible
    #   code reads its user_id, activities and save.
    def initialize(profile)
      @profile = profile
    end

    # Host regexps of every known service module, flattened into one array.
    # Returns an empty array (never nil) when no subclass is loaded.
    def self.all_accepted_hosts
      subclasses.map { |service_module_klass| service_module_klass::ACCEPTED_HOSTS }.flatten
    end

    # First service module class that accepts +url+, or nil when none does.
    def self.klass_for(url)
      subclasses.find { |service_module_klass| service_module_klass.accepts?(url) }
    end

    # All loaded classes that descend from this class, excluding the class
    # itself. Singleton classes of individual instances are skipped because
    # they are not service modules.
    #
    # http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/e812c7cef446a96?pli=1
    def self.subclasses
      ObjectSpace.each_object(Class).select do |klass|
        klass < self && !klass.singleton_class?
      end
    end

    # Based on a profile url, decide if I need to be responsible for this.
    # Returns false when the url cannot be parsed.
    def self.accepts?(url)
      parsed = uri(url)
      return false unless parsed

      host = parsed.host
      self::ACCEPTED_HOSTS.any? { |host_re| host =~ host_re }
    end

    # Guesses usernames from a list of profile URLs.
    #
    # Usernames are ranked by how many URLs yield them (most frequent first).
    # When the two best candidates have the same score, only those two are
    # returned, shortest first.
    #
    # @param profiles [Array<String>] profile URLs
    # @return [Array<String>] candidate usernames, best guess first
    def self.detect_username(profiles)
      counts = Hash.new(0)
      profiles.each do |me_url|
        username = username_from_url(me_url)
        counts[username] += 1 if username
      end
      return [] if counts.empty?

      ranked = counts.to_a.sort { |a, b| b.last <=> a.last }

      # Factor in the shortest username if top 2 matches have the same score
      top_score = ranked[0].last
      if ranked[1] && ranked[1].last == top_score
        ranked = ranked.first(2).sort { |a, b| a.first.size <=> b.first.size }
      end

      ranked.map { |username, _score| username }
    end

    # Extracts a username from a profile URL, or returns nil. Both http and
    # https URLs are recognised. Patterns tried, in order:
    #
    # 1. http(s)://:host/COMMON_DIRECTIVES/:username
    # 2. http(s)://:host/COMMON_DIRECTIVES?:var=:username (the value stops at
    #    the next "&", "#" or "/")
    # 3. http(s)://:host/:username
    #
    # This is a class method, so the +protected+ keyword further down does
    # not apply to it; subclasses call it via self.class.username_from_url.
    def self.username_from_url(url)
      url = url.to_s

      after_path_directive = url.match(%r{\Ahttps?://([^/]+)/([^/]+)/([^/]+)})
      if after_path_directive && COMMON_DIRECTIVES.include?(after_path_directive[2])
        return after_path_directive[3]
      end

      after_query_directive = url.match(%r{\Ahttps?://([^/]+)/([^/]+)\?\w+=([^/&#]+)})
      if after_query_directive && COMMON_DIRECTIVES.include?(after_query_directive[2])
        return after_query_directive[3]
      end

      username_ending = url.match(%r{\Ahttps?://([^/]+)/([^/]+)/*\z})
      return username_ending[2] if username_ending

      nil
    end

    # Parses +url+ into a URI object; returns nil when parsing raises.
    def self.uri(url)
      URI.parse(url)
    rescue StandardError
      nil
    end

    # -- Implementables

    # Update the long term data (eg top used software, biography)
    def create_or_update_summary!(options = {}); raise "Method not implemented"; end

    # This is called as often as possible, this is to passively aggregate the latest activity
    def aggregate_activity!(options = {}); raise "Method not implemented"; end

    # -- Optional Implementables

    # Analyze an individual activity object (eg photo, bookmark, book, software, slide...)
    #
    # Adds keywords extracted from the object's title to its tag list, stamps
    # shallowly_analyzed_at and saves. Objects that already carry a
    # shallowly_analyzed_at value, or have no title, are left untouched.
    def shallow_analysis_on(activity_object)
      return unless activity_object.shallowly_analyzed_at.blank?
      return if activity_object.title.blank?

      tags = Linguistics::Tagger.keywords_for_caption(activity_object.title)
      activity_object.tag_list = (activity_object.tag_list || []) + Array(tags)
      activity_object.shallowly_analyzed_at = Time.now
      activity_object.save
    end

    # Adds tags extracted by ZemantaExtractor from the object's body.
    #
    # Only bodies longer than 100 characters are analysed. A missing body is
    # skipped instead of raising. deeply_analyzed_at is stamped so the guard
    # on the first line can prevent repeated extraction, mirroring
    # shallow_analysis_on.
    def deep_analysis_on(activity_object)
      return unless activity_object.deeply_analyzed_at.blank?

      body = activity_object.body
      return if body.blank? || body.size <= 100

      @extractor = ZemantaExtractor.new
      tags = @extractor.extract_tags(body)
      activity_object.tag_list = (activity_object.tag_list || []) + tags
      activity_object.deeply_analyzed_at = Time.now
      activity_object.save
    end

    protected

    # Builds and stores one activity from a mapped feed entry.
    #
    # +mapped_entry+ maps "object.attribute" destinations (objects: activity,
    # activity_object, media, rating_summary) to values. Entries without an
    # 'activity_object.url' are ignored. The optional block receives the
    # activity and the activity object before analysis and saving.
    #
    # @param mapped_entry [Hash{String => Object}]
    # @param activity_object_type an object responding to id
    # @param activity_verb value stored as the activity's :verb
    # @param additional_activity_parameters [Hash] extra attributes passed to
    #   Activity.create
    def create_activity(mapped_entry, activity_object_type, activity_verb, additional_activity_parameters = {}, &block)
      # Derived from this entry on every call: a list cached from an earlier
      # entry could name objects this entry does not map.
      @object_names = mapped_entry.keys.collect { |oa| oa.split('.').first }

      object_url = mapped_entry['activity_object.url']
      return unless object_url # Need proper warning here

      # Create entity pool
      activity = Activity.create(additional_activity_parameters.merge(
        :user_id => @profile.user_id,
        :verb => activity_verb
      ))
      content_identifier = ActivityObject.content_identifier(object_url)
      activity_object = ActivityObject.fetch(content_identifier, activity_object_type.id)
      activity_object ||= ActivityObject.create(:activity_object_type_id => activity_object_type.id)
      media = @object_names.include?('media') ? Media.create : nil
      rating_summary = @object_names.include?('rating_summary') ? RatingSummary.create : nil

      # Auto-populate attributes
      targets = {
        'activity' => activity,
        'activity_object' => activity_object,
        'media' => media,
        'rating_summary' => rating_summary
      }
      mapped_entry.each do |destination, value|
        assign_mapped_value(targets, destination, value)
      end

      block.call(activity, activity_object) if block

      shallow_analysis_on(activity_object)

      # Save all
      activity_object.media = media unless media.blank?
      activity_object.rating_summary = rating_summary unless rating_summary.blank?
      activity_object.save
      activity.object = activity_object
      activity.save

      @profile.activities << activity
      @profile.save
    end

    private

    # Assigns +value+ to the attribute named by +destination+, a dotted path
    # such as "activity_object.title". The first segment selects an object
    # from +targets+, intermediate segments are called as readers and the
    # last segment is called as a writer.
    #
    # @raise [NameError] when the first segment names no known object
    # @raise [ArgumentError] when the destination has no attribute segment
    def assign_mapped_value(targets, destination, value)
      root, *chain = destination.to_s.split('.')
      unless targets.key?(root)
        raise NameError, "unknown mapping target in #{destination.inspect}"
      end
      if chain.empty?
        raise ArgumentError, "mapping destination #{destination.inspect} names no attribute"
      end

      attribute = chain.pop
      receiver = chain.inject(targets[root]) { |object, reader| object.public_send(reader) }
      receiver.public_send("#{attribute}=", value)
    end

  end

end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
|
2
|
+
module ActivityMapper

  # Service module for Delicious: imports a profile's bookmarks from the
  # Delicious v2 JSON feed as BOOKMARK/POST activities.
  class DeliciousServiceModule < ServiceModule
    # Destination attribute => key in a feed entry. The nil key is passed
    # through to ActivityDataMapper unchanged.
    ACTIVITY_MAP = {
      nil => {
        'activity.occurred_at' => 'dt',
        'activity.caption' => 'd',
        'activity_object.title' => 'd',
        'activity_object.body' => 'd',
        'activity_object.tag_list' => 't',
        'activity_object.url' => 'u',
        'activity.url' => 'u'
      }
    }
    ACCEPTED_HOSTS = [/delicious\.com/, /del\.icio\.us/]

    # Stores the username derived from the profile URL on the profile.
    def create_or_update_summary!(options = {})
      detected = self.class.username_from_url(@profile.url)
      @profile.update_attributes(:username => detected)
    end

    # Fetches the 20 most recent bookmarks, orders them newest first and
    # creates an activity for each one until an already-known entry is
    # reached.
    def aggregate_activity!(options = {})
      mapper = ActivityDataMapper.new(ACTIVITY_MAP)
      feed_url = "http://feeds.delicious.com/v2/json/#{@profile.username}?count=20"
      mapper.fetch!(feed_url, :format => :json)
      mapper.map!

      # Descending by occurrence time, sorted in place.
      mapper.entries.sort! do |left, right|
        right['activity.occurred_at'] <=> left['activity.occurred_at']
      end

      mapper.entries.each do |bookmark|
        # Entries are newest first, so everything after a known one is skipped.
        break if Activity.exists?(@profile.user_id, bookmark)
        create_activity(bookmark, ActivityObjectType::BOOKMARK, ActivityVerb::POST)
      end
    end

  end

end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
module ActivityMapper

  # Service module for Flickr: imports a profile's public photos and its
  # favourites from the Flickr JSON feeds.
  class FlickrServiceModule < ServiceModule
    # Destination attribute => key (or slash-separated path) in a feed item.
    ACTIVITY_MAP = {
      'items' => {
        'activity.occurred_at' => 'published',
        'activity.caption' => 'title',
        'activity_object.title' => 'title',
        'activity_object.body' => 'description',
        'activity_object.spaced_tags' => 'tags',
        'activity_object.url' => 'link',
        'activity.url' => 'link',
        'media.thumbnail_url' => 'media/m',
        'media.embed_url' => 'media/m'
      }
    }
    ACCEPTED_HOSTS = [/flickr\.com/]

    # Stores the username derived from the profile URL. When the profile has
    # no native id yet, asks the flickr.urls.lookupUser API for it and, if a
    # user is returned, saves the id and (when present) the username.
    def create_or_update_summary!(options = {})
      @profile.update_attributes(:username => self.class.username_from_url(@profile.url))
      return unless @profile.native_id.blank?

      lookup_url = "http://api.flickr.com/services/rest/?method=flickr.urls.lookupUser&api_key=#{FLICKR_API_KEY}&url=#{CGI.escape(@profile.url)}&format=json&nojsoncallback=1"
      lookup = Connector.deserialize(Connector.fetch(lookup_url), :json)

      user = lookup['user']
      return unless user

      @profile.native_id = user['id']
      @profile.username = user['username']['_content'] if user['username']
      @profile.save
    end

    # Imports the public-photos feed as POST activities, then the favourites
    # feed as FAVORITE activities. Entries already known for the user are
    # skipped. The same mapper instance is used for both feeds.
    def aggregate_activity!(options = {})
      mapper = ActivityDataMapper.new(ACTIVITY_MAP)

      import_feed(mapper, "http://api.flickr.com/services/feeds/photos_public.gne?id=#{@profile.native_id}&lang=en-us&format=json&nojsoncallback=1") do |photo|
        create_activity(photo, ActivityObjectType::PHOTO, ActivityVerb::POST)
      end

      import_feed(mapper, "http://api.flickr.com/services/feeds/photos_faves.gne?nsid=#{@profile.native_id}&lang=en-us&format=json&nojsoncallback=1") do |photo|
        create_activity(photo, ActivityObjectType::PHOTO, ActivityVerb::FAVORITE)
      end
    end

    private

    # Fetches and maps one JSON feed, then yields every entry that is not
    # yet recorded as an activity for the profile's user.
    def import_feed(mapper, feed_url)
      mapper.fetch!(feed_url, :format => :json)
      mapper.map!
      mapper.entries.each do |entry|
        next if Activity.exists?(@profile.user_id, entry)
        yield entry
      end
    end

  end

end
|