activity_mapper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. data/README.textile +211 -0
  2. data/lib/activity_mapper.rb +35 -0
  3. data/lib/activity_mapper/activity_data_mapper.rb +93 -0
  4. data/lib/activity_mapper/connector.rb +71 -0
  5. data/lib/activity_mapper/lexicon.txt +92662 -0
  6. data/lib/activity_mapper/linguistics.rb +147 -0
  7. data/lib/activity_mapper/service_module.rb +194 -0
  8. data/lib/activity_mapper/service_modules.rb +4 -0
  9. data/lib/activity_mapper/service_modules/delicious.rb +41 -0
  10. data/lib/activity_mapper/service_modules/flickr.rb +59 -0
  11. data/lib/activity_mapper/service_modules/twitter.rb +57 -0
  12. data/lib/activity_mapper/service_modules/wakoopa.rb +350 -0
  13. data/lib/activity_mapper/service_modules/youtube.rb +63 -0
  14. data/lib/extensions/uri.rb +42 -0
  15. data/spec/connector_mock.rb +49 -0
  16. data/spec/data/delicious_ac8fdf9b4e304b150bf198b42a1cb1b4.json +1 -0
  17. data/spec/data/delicious_ee55656e0f69242ccf02c3eb0f97b296.json +1 -0
  18. data/spec/data/flickr_2b826afe2906894197d92f7a41c2785c. +1 -0
  19. data/spec/data/flickr_3c25cf51d174ee4bb2d8673e294ce4c0.json +196 -0
  20. data/spec/data/flickr_482967b550afd05993ec4256fa1de388.json +229 -0
  21. data/spec/data/flickr_54c4b36bea4e2b14c783e5c50cba8544.json +9 -0
  22. data/spec/data/flickr_848ea91c1a903d0347d1029bf863132a. +1 -0
  23. data/spec/data/flickr_bcb82142f3d6998cabab3c82fba15ced.json +141 -0
  24. data/spec/data/flickr_e9d78058938c7c845a8de3a2c3526c61. +1 -0
  25. data/spec/data/twitter_227e1ca72a2ba08a1b03be2cd64b681e.json +274 -0
  26. data/spec/data/twitter_5dd0884137bf193d55d446619ec65c7e.json +1 -0
  27. data/spec/data/twitter_dd694fe9ebda58a84f92a859fb1cb79c.json +1 -0
  28. data/spec/data/wakoopa_1a8b94baf4eed4fd0e7ac39dc2f53059.json +1 -0
  29. data/spec/data/youtube_050e71a038359cc8ff7e00161e687b4e.json +1 -0
  30. data/spec/data/youtube_6d895fb0245f47b9e4fcc868d1f91674.json +1 -0
  31. data/spec/data/youtube_a92042581e44f9d3da548e2c7a89849c.json +1 -0
  32. data/spec/data/youtube_bf7b648ff8720edb1db951b7b9a474c4.json +1 -0
  33. data/spec/data/zemanta_suggest_for_tweet_response.xml +405 -0
  34. data/spec/models.rb +114 -0
  35. data/spec/service_modules/delicious_spec.rb +32 -0
  36. data/spec/service_modules/flickr_spec.rb +43 -0
  37. data/spec/service_modules/twitter_spec.rb +44 -0
  38. data/spec/service_modules/wakoopa_spec.rb +35 -0
  39. data/spec/service_modules/youtube_spec.rb +47 -0
  40. data/spec/spec_helper.rb +8 -0
  41. metadata +130 -0
@@ -0,0 +1,147 @@
1
+ # tagger.rb - a Ruby Part of Speech Tagger patterned on the work of Eric Brill. Version 0.1
2
+ #
3
+ # Copyright 2005 Mark Watson. All rights reserved.
4
+ # This software is released under the LGPL
5
+ #
6
+ # Contributor: Pat Eyler
7
+ #
8
+
9
module ActivityMapper

  module Linguistics

    # Rule-based part-of-speech tagger. Each word first receives the first tag
    # listed for it in lexicon.txt (or "NN" when the word is unknown); a fixed
    # sequence of transformational rules then rewrites individual tags.
    class Tagger

      # Tags whose words keywords_for_caption leaves out of its result.
      UNINTERESTING_PARTS_OF_SPEECH = ['DT', 'PRP', 'IN', 'CC', 'MD'].freeze

      # Loads lexicon.txt (expected next to this file) into @lexicon. Every
      # line is split on whitespace: the first token is the word, the
      # remaining tokens are its tags.
      def initialize
        @lexicon = {}
        lexicon_path = File.join(File.dirname(__FILE__), 'lexicon.txt')
        # File.foreach closes the file itself, even when a line raises.
        File.foreach(lexicon_path) do |line|
          toks = line.split
          next if toks.empty? # blank line: nothing to index
          @lexicon[toks.shift] = toks
        end
      end

      # Splits +words+ on spaces, commas, periods, colons, semicolons and
      # apostrophes. Adjacent delimiters produce empty-string tokens, which
      # are returned as-is.
      def tokenize(words)
        words.split(/ |,|\.|\:|\;|\'/)
      end

      # Returns the downcased tokens of +caption+ that are longer than four
      # characters and whose tag is not in UNINTERESTING_PARTS_OF_SPEECH.
      #
      # Best effort: if anything raises, the raw token list is returned
      # instead, or an empty array when tokenizing itself failed (for
      # example a nil caption), so callers always get an Array.
      def self.keywords_for_caption(caption)
        # Class-level instance variable instead of a class variable; the
        # lexicon is loaded once, on first use.
        @tagger ||= new
        all_keywords = @tagger.tokenize(caption)
        pos_tags = @tagger.part_of_speech_tag(all_keywords)
        keywords = []
        all_keywords.each_with_index do |keyword, i|
          next if UNINTERESTING_PARTS_OF_SPEECH.include?(pos_tags[i])
          next unless keyword.size > 4
          keywords << keyword.downcase
        end
        keywords
      rescue StandardError
        all_keywords || []
      end

      # Tags +text+, which is either a String (tokenized first) or an Array
      # of words, and returns an Array of tags of the same length.
      #
      # Raises RuntimeError when +text+ is neither a String nor an Array.
      def part_of_speech_tag(text)
        ## start by tokenizing strings passed in
        text = tokenize(text) if text.is_a?(String)

        ## we only work on arrays. If text isn't an array, quit now.
        raise RuntimeError, "can't tokenize #{text.class}" unless text.is_a?(Array)

        ## initial guess: first lexicon tag for the word, else a common noun
        ret = text.map { |w| tags_for(w).first || 'NN' }

        ## Now, apply transformational rules:
        text.each_with_index do |word, i|
          previous_tag = i > 0 ? ret[i - 1] : nil

          ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
          if previous_tag == 'DT' && %w[VBD VBP VB].include?(ret[i])
            ret[i] = 'NN'
          end

          ## rule 2: convert a noun to a number (CD) if "." appears in the word
          ret[i] = 'CD' if ret[i] =~ /^N/ && word =~ /\./

          ## rule 3: convert a noun to a past participle if the word ends
          ## with "ed"
          ret[i] = 'VBN' if ret[i] =~ /^N/ && word =~ /ed$/

          ## rule 4: convert any type to adverb if it ends in "ly"
          ret[i] = 'RB' if word =~ /ly$/

          ## rule 5: convert a common noun (NN or NNS) to an adjective if
          ## it ends with "al"
          ret[i] = 'JJ' if ret[i] =~ /^NN/ && word =~ /al$/

          ## rule 6: convert a noun to a verb if the preceding word is "would"
          if i > 0 && ret[i] =~ /^NN/ && text[i - 1].downcase == 'would'
            ret[i] = 'VB'
          end

          ## rule 7: if a word has been categorized as a common noun and
          ## it ends with "s", then set its type to plural common noun (NNS)
          ret[i] = 'NNS' if ret[i] == 'NN' && word =~ /s$/

          ## rule 8: convert a common noun to a present participle
          ## verb (i.e., a gerund)
          ret[i] = 'VBG' if ret[i] =~ /^NN/ && word =~ /ing$/

          ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2>
          ## can also be a verb
          if previous_tag =~ /^NN/ && ret[i] =~ /^NN/
            # tags_for never returns nil, so unknown words are simply skipped
            possible_tags = tags_for(word)
            ret[i] = 'VBN' if possible_tags.include?('VBN')
            ret[i] = 'VBZ' if possible_tags.include?('VBZ')
          end
        end

        ret
      end

      private

      # All lexicon tags for +word+: the exact-case entry when it has tags,
      # otherwise the entry for the downcased word, otherwise an empty array.
      def tags_for(word)
        exact = @lexicon[word]
        return exact if exact && !exact.empty?
        @lexicon[word.downcase] || []
      end

    end # class Tagger

  end

end
@@ -0,0 +1,194 @@
1
+
2
module ActivityMapper

  # Base class for per-service importers. A subclass declares an
  # ACCEPTED_HOSTS constant (a list of host patterns) and implements
  # create_or_update_summary! and aggregate_activity!.
  class ServiceModule

    # Path segments that commonly precede the username in a profile URL.
    COMMON_DIRECTIVES = ['user', 'users', 'person', 'people', 'traveller', 'in', 'profile', 'profiles']

    def initialize(profile)
      @profile = profile
    end

    # Host patterns of every subclass, as one flat array. Returns an empty
    # array (never nil) when no subclass is loaded.
    def self.all_accepted_hosts
      subclasses.map { |service_module_klass| service_module_klass::ACCEPTED_HOSTS }.flatten
    end

    # The first subclass whose accepts? is true for +url+, or nil.
    def self.klass_for(url)
      subclasses.find { |service_module_klass| service_module_klass.accepts?(url) }
    end

    # Every loaded class that descends from this one, excluding itself.
    # http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/e812c7cef446a96?pli=1
    def self.subclasses
      ObjectSpace.each_object(Class).select { |klass| klass < self }
    end

    # Based on a profile url, decide if I need to be responsible for this.
    # False when the url cannot be parsed or has no host.
    def self.accepts?(url)
      u = uri(url)
      return false unless u

      host = u.host
      return false unless host

      self::ACCEPTED_HOSTS.any? { |host_re| host =~ host_re }
    end

    # Counts the username extracted from each profile url and returns the
    # usernames ordered by descending count. When the two most frequent
    # usernames have the same count, only those two are returned, shortest
    # first. Returns [] when no username could be extracted.
    def self.detect_username(profiles)
      usernames = {}
      profiles.each do |me_url|
        username = username_from_url(me_url)
        next unless username
        usernames[username] ||= 0
        usernames[username] += 1
      end

      usernames = usernames.to_a
      return [] if usernames.empty?

      usernames.sort! { |a, b| b.last <=> a.last }

      # Factor in the shortest username if top 2 matches have the same score
      top_score = usernames[0].last
      if usernames[1] && usernames[1].last == top_score
        usernames = [usernames[0], usernames[1]]
        usernames.sort! { |a, b| a.first.size <=> b.first.size }
      end

      usernames.collect(&:first)
    end

    # Extracts a username from a profile url, or returns nil. Three shapes
    # are tried in order, for http and https alike:
    #   http://:host/COMMON_DIRECTIVES/:username
    #   http://:host/COMMON_DIRECTIVES?:var=:username
    #   http://:host/:username
    #
    # This is a public class method: subclasses call it through self.class.
    def self.username_from_url(url)
      return nil if url.nil?

      # Check for http://:host/COMMON_DIRECTIVES/:username
      username_after_directive = url.match(%r{^https?://([^/]+)/([^/]+)/([^/]+)})
      if username_after_directive &&
         username_after_directive[3] &&
         COMMON_DIRECTIVES.include?(username_after_directive[2])
        return username_after_directive[3]
      end

      # Check for http://:host/COMMON_DIRECTIVES?:var=:username
      # The capture stops at "&" so later query parameters are not included.
      username_after_directive = url.match(%r{^https?://([^/]+)/([^/]+)\?\w+=([^/&]+)})
      if username_after_directive &&
         username_after_directive[3] &&
         COMMON_DIRECTIVES.include?(username_after_directive[2])
        return username_after_directive[3]
      end

      # Check for http://:host/username
      username_ending = url.match(%r{^https?://([^/]+)/([^/]+)/*$})
      return username_ending[2] if username_ending

      nil
    end

    # Parses +url+, returning nil instead of raising when parsing fails.
    def self.uri(url)
      URI.parse(url)
    rescue StandardError
      nil
    end

    # -- Implementables

    # Update the long term data (eg top used software, biography)
    def create_or_update_summary!(options = {}); raise "Method not implemented"; end

    # This is called as often as possible, this is to passively aggregate the latest activity
    def aggregate_activity!(options = {}); raise "Method not implemented"; end

    # -- Optional Implementables

    # Analyze an individual activity object (eg photo, bookmark, book, software, slide...)
    #
    # Adds keywords taken from the title to the object's tag list and stamps
    # shallowly_analyzed_at. Does nothing when that stamp is already set or
    # the title is blank.
    def shallow_analysis_on(activity_object)
      return unless activity_object.shallowly_analyzed_at.blank?

      # Extract social connections from body
      #unless activity_object.body.blank?
      #  SocialConnection.generate_from_nanoformats(@profile, activity_object.body)
      #end

      # Extract tags from title
      return if activity_object.title.blank?

      tags = Linguistics::Tagger.keywords_for_caption(activity_object.title)
      activity_object.tag_list = (activity_object.tag_list || []) + tags
      activity_object.shallowly_analyzed_at = Time.now
      activity_object.save
    end

    # Adds tags extracted from the body by ZemantaExtractor. Does nothing
    # when deeply_analyzed_at is set, or when the body is blank or not
    # longer than 100 characters.
    #
    # NOTE(review): unlike shallow_analysis_on, nothing here stamps
    # deeply_analyzed_at -- confirm whether a caller is expected to set it.
    def deep_analysis_on(activity_object)
      return unless activity_object.deeply_analyzed_at.blank?

      body = activity_object.body
      # Check for blank first so a nil body returns instead of raising.
      return if body.blank? || body.size <= 100

      @extractor = ZemantaExtractor.new
      tags = @extractor.extract_tags(body)
      activity_object.tag_list = (activity_object.tag_list || []) + tags
      activity_object.save
    end

    protected

    # Creates an Activity for +mapped_entry+ and attaches it to @profile.
    #
    # mapped_entry::  hash whose keys look like "object.attribute", where
    #                 object is activity, activity_object, media or
    #                 rating_summary. Each value is assigned to that
    #                 attribute.
    # activity_object_type:: its id is used to fetch or create the
    #                 ActivityObject.
    # activity_verb:: stored as the activity's :verb.
    # additional_activity_parameters:: extra attributes for Activity.create.
    # block::         optional; called with (activity, activity_object)
    #                 after the attributes have been populated.
    #
    # Returns nil without doing anything when the entry has no
    # 'activity_object.url'; otherwise returns the result of @profile.save.
    def create_activity(mapped_entry, activity_object_type, activity_verb, additional_activity_parameters = {}, &block)
      return unless mapped_entry['activity_object.url'] # Need proper warning here

      # Recomputed for every entry, so a Media / RatingSummary exists exactly
      # when this entry has values for it.
      @object_names = mapped_entry.keys.collect { |oa| oa.split('.').first }

      # Create entity pool
      activity = Activity.create(additional_activity_parameters.merge(
        :user_id => @profile.user_id,
        :verb => activity_verb
      ))
      content_identifier = ActivityObject.content_identifier(mapped_entry['activity_object.url'])
      activity_object = ActivityObject.fetch(content_identifier, activity_object_type.id)
      activity_object ||= ActivityObject.create(:activity_object_type_id => activity_object_type.id)
      media = @object_names.include?('media') ? Media.create : nil
      rating_summary = @object_names.include?('rating_summary') ? RatingSummary.create : nil

      # Auto-populate attributes through explicit dispatch rather than eval.
      targets = {
        'activity' => activity,
        'activity_object' => activity_object,
        'media' => media,
        'rating_summary' => rating_summary
      }
      mapped_entry.each do |destination, value|
        path = destination.split('.')
        attribute = path.pop
        root = targets.fetch(path.first) do
          raise ArgumentError, "unknown mapping destination #{destination.inspect}"
        end
        # Walk any intermediate getters ("a.b.c" assigns c on a.b).
        receiver = path[1..-1].inject(root) { |obj, getter| obj.public_send(getter) }
        receiver.public_send("#{attribute}=", value)
      end

      block.call(activity, activity_object) if block

      shallow_analysis_on(activity_object)

      # Save all
      activity_object.media = media unless media.blank?
      activity_object.rating_summary = rating_summary unless rating_summary.blank?
      activity_object.save
      activity.object = activity_object
      activity.save

      @profile.activities << activity
      @profile.save
    end

  end

end
@@ -0,0 +1,4 @@
1
+
2
# Require every .rb file found in the service_modules directory that sits
# next to this file.
service_modules_dir = File.join(File.dirname(__FILE__), 'service_modules')

# Dir.entries releases the directory handle itself (a bare Dir.open never
# closes it), and sorting makes the require order independent of the
# filesystem's listing order.
Dir.entries(service_modules_dir).sort.each do |file|
  require File.join(service_modules_dir, file) if File.extname(file) == '.rb'
end
@@ -0,0 +1,41 @@
1
+
2
module ActivityMapper

  # Service module for profiles hosted on delicious.com / del.icio.us.
  class DeliciousServiceModule < ServiceModule
    # Destination attribute => key in a feed entry.
    ACTIVITY_MAP = {
      nil => {
        'activity.occurred_at'     => 'dt',
        'activity.caption'         => 'd',
        'activity_object.title'    => 'd',
        'activity_object.body'     => 'd',
        'activity_object.tag_list' => 't',
        'activity_object.url'      => 'u',
        'activity.url'             => 'u'
      }
    }
    ACCEPTED_HOSTS = [/delicious\.com/, /del\.icio\.us/]

    # Stores the username derived from the profile url on the profile.
    def create_or_update_summary!(options = {})
      detected_username = self.class.username_from_url(@profile.url)
      @profile.update_attributes(:username => detected_username)
    end

    # Fetches the profile's JSON feed (20 entries), orders the mapped entries
    # by 'activity.occurred_at' descending and creates a bookmark activity
    # for each one, stopping at the first entry Activity.exists? reports.
    def aggregate_activity!(options = {})
      mapper = ActivityDataMapper.new(ACTIVITY_MAP)
      feed_url = "http://feeds.delicious.com/v2/json/#{@profile.username}?count=20"

      mapper.fetch!(feed_url, :format => :json)
      mapper.map!
      mapper.entries.sort! do |left, right|
        right['activity.occurred_at'] <=> left['activity.occurred_at']
      end

      mapper.entries.each do |entry|
        break if Activity.exists?(@profile.user_id, entry)
        create_activity(entry, ActivityObjectType::BOOKMARK, ActivityVerb::POST)
      end
    end

  end

end
@@ -0,0 +1,59 @@
1
+
2
module ActivityMapper

  # Service module for profiles hosted on flickr.com.
  class FlickrServiceModule < ServiceModule
    # Destination attribute => key (or path) in a feed item.
    ACTIVITY_MAP = {
      'items' => {
        'activity.occurred_at'        => 'published',
        'activity.caption'            => 'title',
        'activity_object.title'       => 'title',
        'activity_object.body'        => 'description',
        'activity_object.spaced_tags' => 'tags',
        'activity_object.url'         => 'link',
        'activity.url'                => 'link',
        'media.thumbnail_url'         => 'media/m',
        'media.embed_url'             => 'media/m'
      }
    }
    ACCEPTED_HOSTS = [/flickr\.com/]

    # Stores the username derived from the profile url. When the profile has
    # no native_id yet, looks the user up through flickr.urls.lookupUser and,
    # if the response contains a user, saves its id and (when present) its
    # username on the profile.
    def create_or_update_summary!(options = {})
      @profile.update_attributes(:username => self.class.username_from_url(@profile.url))
      return unless @profile.native_id.blank?

      response_body = Connector.fetch("http://api.flickr.com/services/rest/?method=flickr.urls.lookupUser&api_key=#{FLICKR_API_KEY}&url=#{CGI.escape(@profile.url)}&format=json&nojsoncallback=1")
      profile = Connector.deserialize(response_body, :json)
      user = profile['user']
      return unless user

      @profile.native_id = user['id']
      @profile.username = user['username']['_content'] if user['username']
      @profile.save
    end

    # Imports the profile's public photos as POST activities, then its
    # favourites as FAVORITE activities, reusing one mapper for both feeds.
    def aggregate_activity!(options = {})
      mapper = ActivityDataMapper.new(ACTIVITY_MAP)

      import_photo_feed(
        mapper,
        "http://api.flickr.com/services/feeds/photos_public.gne?id=#{@profile.native_id}&lang=en-us&format=json&nojsoncallback=1",
        ActivityVerb::POST
      )
      import_photo_feed(
        mapper,
        "http://api.flickr.com/services/feeds/photos_faves.gne?nsid=#{@profile.native_id}&lang=en-us&format=json&nojsoncallback=1",
        ActivityVerb::FAVORITE
      )
    end

    private

    # Fetches and maps one JSON feed, then creates a photo activity with
    # +verb+ for every entry that Activity.exists? does not report.
    def import_photo_feed(mapper, feed_url, verb)
      mapper.fetch!(feed_url, :format => :json)
      mapper.map!
      mapper.entries.each do |entry|
        next if Activity.exists?(@profile.user_id, entry)
        create_activity(entry, ActivityObjectType::PHOTO, verb)
      end
    end

  end

end