musicbrainz_automatcher 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/README +26 -0
  2. data/lib/musicbrainz_automatcher.rb +376 -0
  3. metadata +106 -0
data/README ADDED
@@ -0,0 +1,26 @@
1
+ = musicbrainz_automatcher
2
+
3
+ * http://github.com/metade/musicbrainz_automatcher
4
+
5
+ == DESCRIPTION:
6
+
7
+ musicbrainz_automatcher intelligently matches artist and track names to MusicBrainz artists
8
+
9
+ == EXAMPLE USE:
10
+
11
+ require 'rubygems'
12
+ require 'musicbrainz_automatcher'
13
+
14
+ m = MusicbrainzAutomatcher.new
15
+ m.match_artist 'Coldplay', 'Yellow'
16
+
17
+ => "cc197bad-dc9c-440d-a5b5-d52ba2e14234"
18
+
19
+ == REQUIREMENTS:
20
+
21
+ * depends on rbrainz, active_support, iconv and text
22
+
23
+ == AUTHORS:
24
+
25
+ Nicholas Humfrey
26
+ Patrick Sinclair
@@ -0,0 +1,376 @@
1
+ require 'active_support'
2
+ require 'rbrainz'
3
+ require 'iconv'
4
+ require 'text'
5
+
6
+ # Monkey patch String class to add our escaping method
7
+ class String
8
+ def lucene_escape_query
9
+ return self.gsub(/([+\-|!(){}\[\]\^'"~*?:\\])/) {|s| '\\'+s}
10
+ end
11
+ end
12
+
13
+ # Class to automatically match an artist and track to a MusicBrainz artist
14
+ class MusicbrainzAutomatcher
15
+ attr_accessor :logger
16
+ attr_accessor :network_timeout
17
+ attr_accessor :network_retries
18
+ attr_reader :cache
19
+ attr_reader :mbws
20
+
21
+ def initialize(options={})
22
+ # Configuration options
23
+ @network_timeout = options[:network_timeout] || 15 # seconds
24
+ @network_retries = options[:network_retries] || 3
25
+
26
+ # Create MusicBrainz webservice
27
+ host = options[:musicbrainz_host] || 'musicbrainz.org'
28
+ @mbws = MusicBrainz::Webservice::Webservice.new(:host => host, :proxy => options[:proxy])
29
+ @mbws.open_timeout = @network_timeout
30
+ @mbws.read_timeout = @network_timeout
31
+
32
+ # Create a query cache
33
+ @cache = ActiveSupport::Cache.lookup_store(options[:cache_type] || :memory_store)
34
+
35
+ # Create a logger
36
+ @logger = options[:logger] || Logger.new(STDOUT)
37
+ end
38
+
39
+
40
+ # Given an array of artists and a track title, return the MusicBrainz ID of the matching artist.
41
+ # If there is no match in MusicBrainz, then false is returned
42
+ def match_artist(artists, title=nil)
43
+
44
+ # Only interested in first item of title array
45
+ title = title.first if title.is_a?(Array)
46
+
47
+ # Remove excess whitespace from the title
48
+ title.strip! unless title.nil?
49
+
50
+ # Clean and split the artist names
51
+ artists = clean_artists( artists )
52
+
53
+ # Return false if no artist names given
54
+ return false if artists.empty?
55
+
56
+ # Set title to nil, if it is an empty string
57
+ title = nil if !title.nil? and title.size<1
58
+
59
+ # Perform the query if it isn't already cached
60
+ artist = join_artists( artists )
61
+ do_cached( "artists=#{artist} title=#{title}" ) do
62
+
63
+ # Remove items from the artist array until we get a match
64
+ mbartist_id = false
65
+
66
+ ## Ignore if artist name contains two consecutive stars (it contains a censored swear word)
67
+ unless artist =~ /\*\*/
68
+
69
+ ## First: lookup based on track name and artist
70
+ unless title.nil?
71
+ mbartist_id = lookup_by_track( artist, title )
72
+
73
+ ## Second: try removing brackets from the track name
74
+ if !mbartist_id
75
+ matches = title.match(/^(.+)\s+\(.+\)$/)
76
+ mbartist_id = lookup_by_track( artist, matches[1] ) unless matches.nil?
77
+ end
78
+ end
79
+
80
+ ## Third: look-up just based on artist name
81
+ # (but not after we have removed an artist from the stack)
82
+ if !mbartist_id
83
+ # Also cache the lookup, just based on the artist name
84
+ mbartist_id = do_cached( "artist_name=#{artist}" ) do
85
+ lookup_by_artist( artist )
86
+ end
87
+ end
88
+ end
89
+
90
+ # Response is the MusicBrainz ID
91
+ mbartist_id
92
+ end
93
+
94
+ end
95
+
96
+ protected
97
+
98
+ ## Perform a block if key isn't already cached.
99
+ def do_cached( key, &block )
100
+ # have a look in the cache
101
+ value = @cache.fetch( key )
102
+
103
+ # Cache HIT?
104
+ return value unless value.nil?
105
+
106
+ # Cache MISS : execute the block
107
+ value = block.call( key )
108
+
109
+ # Store value in the cache
110
+ return @cache.write( key, value, :expires_at => Time.parse("18:00"))
111
+ end
112
+
113
+ # Clean up the artist name array
114
+ def clean_artists(artists)
115
+
116
+ # Turn the artists into an array, if it isn't already
117
+ artists = [artists] unless artists.is_a?(Array)
118
+
119
+ # Split up artist names
120
+ artists.map! { |a| a.split(/\s+featuring\s+/i) }.flatten!
121
+ artists.map! { |a| a.split(/\s+feat\.?\s+/i) }.flatten!
122
+ artists.map! { |a| a.split(/\s+ft\.?\s+/i) }.flatten!
123
+ artists.map! { |a| a.split(/\s+vs\.?\s+/i) }.flatten!
124
+ artists.map! { |a| a.split(/\//) }.flatten!
125
+ artists.map! { |a| a.split(/\&/) }.flatten!
126
+
127
+ # Remove whitespace from start and end of artist names
128
+ artists.each {|a| a.strip! }
129
+
130
+ # Delete any empty artist names
131
+ artists.delete_if { |a| a.blank? }
132
+
133
+ return artists
134
+ end
135
+
136
+ # Concatenate an array of artist names together into a single string
137
+ def join_artists(array)
138
+ return "" if array.nil? or array.size<1
139
+ return array.last if array.size==1
140
+
141
+ rest = array.slice(0,array.size-1).join(', ')
142
+ rest += " and " if (rest.size>0)
143
+ return rest+array.last
144
+ end
145
+
146
+
147
+ ## Remove accents, remove non-word characters, remove whitespace
148
+ def compact_string(str)
149
+ ascii = Iconv.iconv("ascii//IGNORE//TRANSLIT", "utf-8", str).join
150
+ ascii.downcase!
151
+ ascii.gsub!('&', ' and ')
152
+ return ascii.gsub(/[\W_]+/, "")
153
+ end
154
+
155
+
156
+ # Return the highest of two numbers
157
+ # Isn't there a ruby built-in to do this?
158
+ def max(i1, i2)
159
+ i1>i2 ? i1 : i2
160
+ end
161
+
162
+ # How similar are two strings?
163
+ def string_percent_similar(str1, str2)
164
+ # Optimisation: Completely identical? (give it a 1% boost!)
165
+ return 101 if str1 == str2
166
+
167
+ # Catch iconv failures
168
+ begin
169
+ s1 = compact_string(str1)
170
+ s2 = compact_string(str2)
171
+ rescue Iconv::IllegalSequence
172
+ # Not similar
173
+ return 0
174
+ end
175
+
176
+ # Don't allow empty strings to match
177
+ return 0 if s1.size==0 or s2.size==0
178
+
179
+ # How similar are the two strings?
180
+ distance = Text::Levenshtein::distance( s1, s2 )
181
+ length = max( s1.size, s2.size ).to_f
182
+ percent = ((length-distance.to_f)/length) * 100
183
+
184
+ return percent
185
+ end
186
+
187
+ # Compare two artist names and return how similar they are (in percent)
188
+ def artist_names_similarity(mbartist, artist_name2)
189
+
190
+ # Compare the two artist names
191
+ best_score = string_percent_similar( mbartist.name, artist_name2 )
192
+ @logger.debug("Comparing artist '#{mbartist.name}' with '#{artist_name2}' : similarity=#{best_score.to_i}%")
193
+
194
+ # Optimisation: can't do better than 100% similar
195
+ return best_score if best_score >= 100
196
+
197
+ # Fetch the artist's aliases
198
+ aliases = get_artist_aliases(mbartist.id.uuid)
199
+
200
+ # Compare with each of the aliases
201
+ unless aliases.nil?
202
+ aliases.each do |artist_alias|
203
+ # Compare the artist alias name
204
+ percent = string_percent_similar( artist_alias, artist_name2 )
205
+ @logger.debug("Comparing alias '#{artist_alias}' with '#{artist_name2}' : similarity=#{percent.to_i}%")
206
+ best_score = percent if (best_score < percent)
207
+ end
208
+ end
209
+
210
+ return best_score
211
+ end
212
+
213
+
214
+ ## Lookup artist based on an artist and track name
215
+ # Returns musicbrainz artist gid, or false if no match found
216
+ def lookup_by_track(artist, title)
217
+ @logger.info("Looking up '#{artist}' with track '#{title}'")
218
+
219
+ # Create a new track filter
220
+ filter = MusicbrainzAutomatcher::new_track_filter(artist, title)
221
+
222
+ # Query MusicBrainz server, but catch any errors
223
+ attempt = 0
224
+ begin
225
+ q = MusicBrainz::Webservice::Query.new(@mbws)
226
+ results = q.get_tracks(filter)
227
+ rescue Exception => e
228
+ @logger.error("Error querying MusicBrainz for artist (attempt #{attempt}): #{e.inspect}")
229
+ sleep((attempt+=1)**2)
230
+ retry if attempt < @network_retries
231
+ raise e
232
+ end
233
+
234
+ matched_mbid = false
235
+ for result in results
236
+ ## Abort if score is less than 75%
237
+ break if (result.score < 75)
238
+
239
+ @logger.debug(" Score: "+result.score.to_s)
240
+ @logger.debug(" title: "+result.entity.title)
241
+ @logger.debug(" artist: "+result.entity.artist.name)
242
+ @logger.debug(" artist mbid: "+result.entity.artist.id.uuid)
243
+
244
+ # Optimisation: skip if it is an artist we have already matched to
245
+ next if matched_mbid == result.entity.artist.id.uuid
246
+
247
+ # Compare the artist names
248
+ if (artist_names_similarity(result.entity.artist, artist)<75)
249
+ @logger.debug(" artist name similarity is less than 75%, skipping.")
250
+ next
251
+ end
252
+
253
+ ## More than one artist?
254
+ if (matched_mbid != result.entity.artist.id.uuid and matched_mbid)
255
+ @logger.info(" Found more then one artist with a high score, giving up.")
256
+ return false
257
+ else
258
+ matched_mbid = result.entity.artist.id.uuid
259
+ end
260
+ end
261
+
262
+ # Did we find something?
263
+ if matched_mbid
264
+ ## Yay!
265
+ @logger.info(" Matched to artist ID: #{matched_mbid}")
266
+ return matched_mbid
267
+ else
268
+ # didn't find anything :(
269
+ @logger.info(" Lookup by track failed")
270
+ return false
271
+ end
272
+
273
+ end
274
+
275
+
276
+ ## Lookup artist, just based on its name
277
+ # Returns musicbrainz artist gid, or false if no match found
278
+ def lookup_by_artist(name)
279
+ @logger.info("Looking up '#{name}' just by name")
280
+
281
+ filter = MusicBrainz::Webservice::ArtistFilter.new(
282
+ :name => name,
283
+ :limit => 20
284
+ )
285
+
286
+ # Query MusicBrainz server, but catch any errors
287
+ attempt = 0
288
+ begin
289
+ q = MusicBrainz::Webservice::Query.new(@mbws)
290
+ results = q.get_artists(filter)
291
+ rescue Exception => e
292
+ @logger.error("Error querying MusicBrainz for artist (attempt #{attempt}): #{e.inspect}")
293
+ sleep((attempt+=1)**2)
294
+ retry if attempt < @network_retries
295
+ raise e
296
+ end
297
+
298
+ similarities = {}
299
+ for result in results
300
+ ## Give up if score is less than 50%
301
+ break if (result.score < 50)
302
+
303
+ ## Work out how similar the artist names are
304
+ similarity = artist_names_similarity(result.entity, name).to_i
305
+ next if similarity<=0
306
+
307
+ ## Store it in the hash
308
+ @logger.debug(" Score: #{result.score}")
309
+ @logger.debug(" name: #{result.entity.name}")
310
+ @logger.debug(" similarity: #{similarity}")
311
+ similarities[similarity] ||= [];
312
+ similarities[similarity] << result.entity
313
+ end
314
+
315
+ if similarities.keys.size < 1
316
+ @logger.info(" No matches found when looking up 'just by name")
317
+ return false
318
+ end
319
+
320
+ ## Order by similarity
321
+ most_similar = similarities.keys.sort.last
322
+ if most_similar < 85
323
+ @logger.info(" Closest match is less than 85% similar")
324
+ return false
325
+ elsif similarities[most_similar].length != 1
326
+ @logger.info(" More then one shortest distance, giving up")
327
+ return false
328
+ else
329
+ rbartist = similarities[most_similar].first
330
+ @logger.debug(" Found artist by name: #{rbartist.id.uuid}")
331
+ return rbartist.id.uuid
332
+ end
333
+ end
334
+
335
+ def get_artist_aliases(mbid)
336
+ # Hack to stop hitting the MusicBrainz server for Various Artists
337
+ return [] if mbid == '89ad4ac3-39f7-470e-963a-56509c546377'
338
+
339
+ attempt = 0
340
+ begin
341
+ q = MusicBrainz::Webservice::Query.new(@mbws)
342
+ artist_includes = MusicBrainz::Webservice::ArtistIncludes.new( :aliases => true )
343
+ response = q.get_artist_by_id(mbid, artist_includes)
344
+ if response.nil?
345
+ aliases = []
346
+ else
347
+ aliases = response.aliases.map { |a| a.name }
348
+ end
349
+
350
+ # Add fake "The Artist" alias if it isn't already there
351
+ if !response.nil? and response.name !~ /^The /i and !aliases.include?("The #{response.name}")
352
+ aliases << "The #{response.name}"
353
+ end
354
+
355
+ return aliases
356
+ rescue Exception => e
357
+ @logger.error("Error querying MusicBrainz for artist aliases (attempt #{attempt}): #{e.inspect}")
358
+ sleep((attempt+=1)**2)
359
+ retry if attempt < @network_retries
360
+ raise e
361
+ end
362
+ end
363
+
364
+
365
+ ## A custom track filter, that uses a query more like the normal MusicBrainz search page
366
+ def self.new_track_filter(artist, title)
367
+ # Escape the strings
368
+ tterm = title.lucene_escape_query
369
+ aterm = artist.lucene_escape_query
370
+
371
+ filter = MusicBrainz::Webservice::TrackFilter.new(
372
+ :query => "artist:(#{aterm})(sortname:(#{aterm}) alias:(#{aterm}) !artist:(#{aterm})) track:#{tterm}",
373
+ :limit => 20
374
+ )
375
+ end
376
+ end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: musicbrainz_automatcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Patrick Sinclair
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-06 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: Text
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: "1.1"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: rbrainz
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: "0.5"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: activesupport
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: "2.3"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: rspec
47
+ type: :development
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ - !ruby/object:Gem::Dependency
56
+ name: mocha
57
+ type: :development
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ description:
66
+ email: metade@gmail.com
67
+ executables: []
68
+
69
+ extensions: []
70
+
71
+ extra_rdoc_files:
72
+ - README
73
+ files:
74
+ - README
75
+ - lib/musicbrainz_automatcher.rb
76
+ has_rdoc: true
77
+ homepage: http://github.com/metade/musicbrainz_automatcher
78
+ licenses: []
79
+
80
+ post_install_message:
81
+ rdoc_options:
82
+ - --main
83
+ - README
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: "0"
91
+ version:
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: "0"
97
+ version:
98
+ requirements: []
99
+
100
+ rubyforge_project: musicbrainz_automatcher
101
+ rubygems_version: 1.3.5
102
+ signing_key:
103
+ specification_version: 3
104
+ summary: What this thing does
105
+ test_files: []
106
+