musicbrainz_automatcher 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +26 -0
- data/lib/musicbrainz_automatcher.rb +376 -0
- metadata +106 -0
data/README
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
= musicbrainz_automatcher
|
2
|
+
|
3
|
+
* http://github.com/metade/musicbrainz_automatcher
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
musicbrainz_automatcher intelligently matches artist and track names to MusicBrainz artists
|
8
|
+
|
9
|
+
== EXAMPLE USE:
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'musicbrainz_automatcher'
|
13
|
+
|
14
|
+
m = MusicbrainzAutomatcher.new
|
15
|
+
m.match_artist 'Coldplay', 'Yellow'
|
16
|
+
|
17
|
+
=> "cc197bad-dc9c-440d-a5b5-d52ba2e14234"
|
18
|
+
|
19
|
+
== REQUIREMENTS:
|
20
|
+
|
21
|
+
* depends on rbrainz, active_support, iconv and text
|
22
|
+
|
23
|
+
== AUTHORS:
|
24
|
+
|
25
|
+
Nicholas Humfrey
|
26
|
+
Patrick Sinclair
|
@@ -0,0 +1,376 @@
|
|
1
|
+
require 'timeout'

require 'active_support'
require 'rbrainz'
require 'iconv'
require 'text'
|
5
|
+
|
6
|
+
# Monkey patch String class to add our escaping method
|
7
|
+
class String
  # Returns a copy of the string in which each character that the pattern
  # below lists (Lucene query operators plus quotes) is prefixed with a
  # backslash. The receiver itself is left untouched.
  def lucene_escape_query
    gsub(/[+\-|!(){}\[\]\^'"~*?:\\]/) { |special| "\\#{special}" }
  end
end
|
12
|
+
|
13
|
+
# Class to automatically match an artist and track to a MusicBrainz artist
|
14
|
+
class MusicbrainzAutomatcher
|
15
|
+
attr_accessor :logger
|
16
|
+
attr_accessor :network_timeout
|
17
|
+
attr_accessor :network_retries
|
18
|
+
attr_reader :cache
|
19
|
+
attr_reader :mbws
|
20
|
+
|
21
|
+
# Builds a matcher.
#
# Recognised +options+ keys (all optional):
#   :network_timeout  - open/read timeout in seconds (default 15)
#   :network_retries  - maximum attempts per MusicBrainz query (default 3)
#   :musicbrainz_host - webservice host (default 'musicbrainz.org')
#   :proxy            - handed to the webservice client unchanged
#   :cache_type       - ActiveSupport cache store to look up (default :memory_store)
#   :logger           - logger to use (default: a Logger writing to STDOUT)
def initialize(options={})
  # Network tuning
  @network_timeout = options[:network_timeout] || 15
  @network_retries = options[:network_retries] || 3

  # MusicBrainz webservice client; connect and read share the same timeout
  @mbws = MusicBrainz::Webservice::Webservice.new(
    :host  => options[:musicbrainz_host] || 'musicbrainz.org',
    :proxy => options[:proxy]
  )
  @mbws.open_timeout = @network_timeout
  @mbws.read_timeout = @network_timeout

  # Query cache
  cache_store = options[:cache_type] || :memory_store
  @cache = ActiveSupport::Cache.lookup_store(cache_store)

  # Logging
  @logger = options[:logger] || Logger.new(STDOUT)
end
|
38
|
+
|
39
|
+
|
40
|
+
# Given an artist name (or an array of artist names) and a track title, return the MusicBrainz artist ID.
|
41
|
+
# If there is no match in MusicBrainz, then false is returned
|
42
|
+
# Matches an artist (optionally helped by a track title) to a MusicBrainz
# artist ID.
#
# artists - an artist name, or an array of artist names
# title   - a track title, or an array of which only the first item is used
#           (optional)
#
# Returns the MusicBrainz artist ID, or false when no confident match is
# found. Lookups go through do_cached, keyed on the joined artist names and
# the title.
def match_artist(artists, title=nil)
  # Only interested in first item of title array
  title = title.first if title.is_a?(Array)

  # Strip into a copy so the caller's string is never modified (and frozen
  # strings are accepted); an empty title is treated as "no title"
  title = title.strip unless title.nil?
  title = nil if !title.nil? && title.empty?

  # Clean and split the artist names; nothing to do without any
  artists = clean_artists( artists )
  return false if artists.empty?

  # Perform the query if it isn't already cached
  artist = join_artists( artists )
  do_cached( "artists=#{artist} title=#{title}" ) do
    mbartist_id = false

    # Ignore artist names containing two consecutive stars (censored swear words)
    unless artist =~ /\*\*/

      unless title.nil?
        # First: lookup based on track name and artist
        mbartist_id = lookup_by_track( artist, title )

        # Second: retry with a trailing "(...)" suffix removed from the title
        if !mbartist_id
          matches = title.match(/^(.+)\s+\(.+\)$/)
          mbartist_id = lookup_by_track( artist, matches[1] ) unless matches.nil?
        end
      end

      # Third: look up by artist name alone; that result is cached under
      # its own key as well
      if !mbartist_id
        mbartist_id = do_cached( "artist_name=#{artist}" ) do
          lookup_by_artist( artist )
        end
      end
    end

    # Response is the MusicBrainz ID (or false)
    mbartist_id
  end
end
|
95
|
+
|
96
|
+
protected
|
97
|
+
|
98
|
+
## Perform a block if key isn't already cached.
|
99
|
+
# Returns the cached value for +key+; on a cache miss, calls the block with
# the key, stores the block's result and returns it.
#
# Only nil counts as a miss, so a cached false ("no match") is a hit.
def do_cached( key, &block )
  # Have a look in the cache
  cached = @cache.fetch( key )
  return cached unless cached.nil?

  # Cache MISS : execute the block
  value = block.call( key )

  # Store value in the cache.
  # NOTE(review): whether :expires_at is honoured depends on the cache store
  # and ActiveSupport version in use -- confirm.
  @cache.write( key, value, :expires_at => Time.parse("18:00") )

  # Return the computed value itself rather than whatever write returns;
  # many cache stores return a boolean from write.
  value
end
|
112
|
+
|
113
|
+
# Clean up the artist name array
|
114
|
+
# Normalises artist input into a flat list of individual artist names.
#
# artists - an artist name, an array of artist names, or nil
#
# Names are split on "featuring", "feat", "ft", "vs" (the abbreviations with
# an optional trailing dot, all case-insensitive), "/" and "&", then stripped
# of surrounding whitespace; empty names and nil entries are dropped.
#
# Returns a new array; the argument passed in is not modified.
def clean_artists(artists)
  # Turn the artists into an array, if it isn't already
  names = artists.is_a?(Array) ? artists : [artists]
  names = names.compact

  # Applied one after another, in this order, to the pieces produced so far
  separators = [
    /\s+featuring\s+/i,
    /\s+feat\.?\s+/i,
    /\s+ft\.?\s+/i,
    /\s+vs\.?\s+/i,
    /\//,
    /\&/
  ]
  separators.each do |separator|
    names = names.map { |name| name.split(separator) }.flatten
  end

  # Trim each name and drop the ones that end up empty
  names.map { |name| name.strip }.reject { |name| name.empty? }
end
|
135
|
+
|
136
|
+
# Concatenate an array of artist names together into a single string
|
137
|
+
# Joins artist names into one human-readable string, e.g.
# ["a", "b", "c"] becomes "a, b and c".
#
# Returns "" for nil or an empty array, and the sole element for a
# one-element array.
def join_artists(array)
  return "" if array.nil? || array.size < 1

  final = array.last
  return final if array.size == 1

  leading = array[0...-1].join(', ')
  leading = "#{leading} and " if leading.size > 0
  leading + final
end
|
145
|
+
|
146
|
+
|
147
|
+
## Remove accents, remove non-word characters, remove whitespace
|
148
|
+
# Reduces a string to a compact comparison key: converted from UTF-8 to
# ASCII via Iconv (IGNORE/TRANSLIT), lower-cased, "&" spelled out as "and",
# and every run of non-word characters or underscores removed.
def compact_string(str)
  transliterated = Iconv.iconv("ascii//IGNORE//TRANSLIT", "utf-8", str).join
  transliterated.downcase.gsub('&', ' and ').gsub(/[\W_]+/, "")
end
|
154
|
+
|
155
|
+
|
156
|
+
# Return the highest of two numbers
|
157
|
+
# Isn't there a ruby built-in to do this?
|
158
|
+
# Returns the larger of the two arguments; when neither is greater, the
# second argument is returned.
def max(i1, i2)
  return i1 if i1 > i2
  i2
end
|
161
|
+
|
162
|
+
# How similar are two strings?
|
163
|
+
# Scores how alike two strings are, as a percentage.
#
# Returns 101 when the raw strings are identical (a 1% boost over any fuzzy
# match), 0 when either string compacts to nothing or cannot be converted by
# Iconv, and otherwise the Levenshtein-based similarity of the compacted
# strings as a Float.
def string_percent_similar(str1, str2)
  # Identical input needs no further work
  return 101 if str1 == str2

  # An Iconv conversion failure counts as "not similar"
  begin
    compacted = [str1, str2].map { |raw| compact_string(raw) }
  rescue Iconv::IllegalSequence
    return 0
  end

  # Don't allow empty strings to match
  return 0 if compacted.any? { |text| text.size == 0 }

  s1, s2 = compacted
  distance = Text::Levenshtein::distance( s1, s2 )
  longest = [s1.size, s2.size].max.to_f
  ((longest - distance.to_f) / longest) * 100
end
|
186
|
+
|
187
|
+
# Compare two artist names and return how similar they are (in percent)
|
188
|
+
# Scores how well a MusicBrainz artist matches a given name.
#
# mbartist     - object responding to #name and #id.uuid
# artist_name2 - the name to compare against
#
# The artist's own name is compared first; unless that already scores 100
# or more, the artist's aliases are fetched and compared too. Returns the
# best score seen.
def artist_names_similarity(mbartist, artist_name2)
  top = string_percent_similar( mbartist.name, artist_name2 )
  @logger.debug("Comparing artist '#{mbartist.name}' with '#{artist_name2}' : similarity=#{top.to_i}%")

  # Can't do better than 100% similar, so skip the alias lookup
  return top if top >= 100

  aliases = get_artist_aliases(mbartist.id.uuid)
  return top if aliases.nil?

  aliases.each do |alias_name|
    score = string_percent_similar( alias_name, artist_name2 )
    @logger.debug("Comparing alias '#{alias_name}' with '#{artist_name2}' : similarity=#{score.to_i}%")
    top = score if top < score
  end

  top
end
|
212
|
+
|
213
|
+
|
214
|
+
## Lookup artist based on an artist and track name
|
215
|
+
# Returns musicbrainz artist gid, or false if no match found
|
216
|
+
# Looks up an artist using both the artist name and a track title.
#
# artist - the (joined) artist name
# title  - the track title
#
# Returns the MusicBrainz artist ID when exactly one artist is found whose
# track result scores at least 75 and whose name is at least 75% similar;
# returns false otherwise.
#
# Raises the last query error once @network_retries attempts have failed.
def lookup_by_track(artist, title)
  @logger.info("Looking up '#{artist}' with track '#{title}'")

  # Create a new track filter
  filter = MusicbrainzAutomatcher::new_track_filter(artist, title)

  # Query MusicBrainz server, retrying with a growing pause on errors.
  # Only ordinary errors and timeouts are retried, so Interrupt/SystemExit
  # still stop the program straight away.
  attempt = 0
  begin
    results = MusicBrainz::Webservice::Query.new(@mbws).get_tracks(filter)
  rescue StandardError, Timeout::Error => e
    @logger.error("Error querying MusicBrainz for artist (attempt #{attempt}): #{e.inspect}")
    attempt += 1
    # Out of attempts: re-raise at once instead of sleeping first
    raise if attempt >= @network_retries
    sleep(attempt**2)
    retry
  end

  matched_mbid = false
  results.each do |result|
    # Abort if score is less than 75%
    break if result.score < 75

    candidate = result.entity.artist
    candidate_id = candidate.id.uuid

    @logger.debug(" Score: #{result.score}")
    @logger.debug(" title: #{result.entity.title}")
    @logger.debug(" artist: #{candidate.name}")
    @logger.debug(" artist mbid: #{candidate_id}")

    # Optimisation: skip if it is an artist we have already matched to
    next if matched_mbid == candidate_id

    # Compare the artist names
    if artist_names_similarity(candidate, artist) < 75
      @logger.debug(" artist name similarity is less than 75%, skipping.")
      next
    end

    # A second, different artist also matches well: too ambiguous
    if matched_mbid
      @logger.info(" Found more then one artist with a high score, giving up.")
      return false
    end

    matched_mbid = candidate_id
  end

  # Did we find something?
  if matched_mbid
    @logger.info(" Matched to artist ID: #{matched_mbid}")
  else
    @logger.info(" Lookup by track failed")
  end
  matched_mbid
end
|
274
|
+
|
275
|
+
|
276
|
+
## Lookup artist, just based on its name
|
277
|
+
# Returns musicbrainz artist gid, or false if no match found
|
278
|
+
# Looks up an artist using only its name.
#
# name - the (joined) artist name
#
# Results scoring below 50 end the scan. Each remaining artist is rated with
# artist_names_similarity (truncated to an integer). Returns the MusicBrainz
# artist ID when a single artist has the highest rating and that rating is
# at least 85; returns false otherwise.
#
# Raises the last query error once @network_retries attempts have failed.
def lookup_by_artist(name)
  @logger.info("Looking up '#{name}' just by name")

  filter = MusicBrainz::Webservice::ArtistFilter.new(
    :name => name,
    :limit => 20
  )

  # Query MusicBrainz server, retrying with a growing pause on errors.
  # Only ordinary errors and timeouts are retried, so Interrupt/SystemExit
  # still stop the program straight away.
  attempt = 0
  begin
    results = MusicBrainz::Webservice::Query.new(@mbws).get_artists(filter)
  rescue StandardError, Timeout::Error => e
    @logger.error("Error querying MusicBrainz for artist (attempt #{attempt}): #{e.inspect}")
    attempt += 1
    # Out of attempts: re-raise at once instead of sleeping first
    raise if attempt >= @network_retries
    sleep(attempt**2)
    retry
  end

  # Group the candidate artists by their (integer) name similarity
  similarities = {}
  results.each do |result|
    # Give up if score is less than 50%
    break if result.score < 50

    # Work out how similar the artist names are
    similarity = artist_names_similarity(result.entity, name).to_i
    next if similarity <= 0

    @logger.debug(" Score: #{result.score}")
    @logger.debug(" name: #{result.entity.name}")
    @logger.debug(" similarity: #{similarity}")
    (similarities[similarity] ||= []) << result.entity
  end

  if similarities.empty?
    @logger.info(" No matches found when looking up 'just by name")
    return false
  end

  # Only the best-rated group is of interest
  most_similar = similarities.keys.max
  candidates = similarities[most_similar]

  if most_similar < 85
    @logger.info(" Closest match is less than 85% similar")
    return false
  elsif candidates.length != 1
    @logger.info(" More then one shortest distance, giving up")
    return false
  end

  rbartist = candidates.first
  @logger.debug(" Found artist by name: #{rbartist.id.uuid}")
  rbartist.id.uuid
end
|
334
|
+
|
335
|
+
# Fetches the alias names of a MusicBrainz artist.
#
# mbid - the artist's MusicBrainz ID
#
# Returns an array of alias names. Unless the artist's name already starts
# with "The ", a synthetic "The <name>" alias is appended when not present.
# Returns [] for the hard-coded Various Artists ID (no request is made) and
# when the webservice returns nil.
#
# Raises the last query error once @network_retries attempts have failed.
def get_artist_aliases(mbid)
  # Hack to stop hitting the MusicBrainz server for Various Artists
  return [] if mbid == '89ad4ac3-39f7-470e-963a-56509c546377'

  attempt = 0
  begin
    q = MusicBrainz::Webservice::Query.new(@mbws)
    artist_includes = MusicBrainz::Webservice::ArtistIncludes.new( :aliases => true )
    response = q.get_artist_by_id(mbid, artist_includes)
    return [] if response.nil?

    aliases = response.aliases.map { |a| a.name }

    # Add fake "The Artist" alias if it isn't already there
    with_article = "The #{response.name}"
    if response.name !~ /^The /i && !aliases.include?(with_article)
      aliases << with_article
    end

    aliases
  rescue StandardError, Timeout::Error => e
    # Only ordinary errors and timeouts are retried, so Interrupt/SystemExit
    # still stop the program straight away.
    @logger.error("Error querying MusicBrainz for artist aliases (attempt #{attempt}): #{e.inspect}")
    attempt += 1
    # Out of attempts: re-raise at once instead of sleeping first
    raise if attempt >= @network_retries
    sleep(attempt**2)
    retry
  end
end
|
363
|
+
|
364
|
+
|
365
|
+
## A custom track filter, that uses a query more like the normal MusicBrainz search page
|
366
|
+
# Builds a MusicBrainz track filter from a hand-written query string that
# searches the artist name, sort name and alias fields together with the
# track title. Both inputs are escaped with String#lucene_escape_query
# first; at most 20 results are requested.
#
# NOTE(review): the title is inserted as "track:<title>" without
# parentheses, unlike the artist terms -- confirm multi-word titles are
# searched as intended.
def self.new_track_filter(artist, title)
  escaped_title = title.lucene_escape_query
  escaped_artist = artist.lucene_escape_query

  query = "artist:(#{escaped_artist})(sortname:(#{escaped_artist}) alias:(#{escaped_artist}) !artist:(#{escaped_artist})) track:#{escaped_title}"

  MusicBrainz::Webservice::TrackFilter.new(:query => query, :limit => 20)
end
|
376
|
+
end
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: musicbrainz_automatcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Patrick Sinclair
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-06 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: Text
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "1.1"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rbrainz
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0.5"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: activesupport
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "2.3"
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: rspec
|
47
|
+
type: :development
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "0"
|
54
|
+
version:
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: mocha
|
57
|
+
type: :development
|
58
|
+
version_requirement:
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: "0"
|
64
|
+
version:
|
65
|
+
description:
|
66
|
+
email: metade@gmail.com
|
67
|
+
executables: []
|
68
|
+
|
69
|
+
extensions: []
|
70
|
+
|
71
|
+
extra_rdoc_files:
|
72
|
+
- README
|
73
|
+
files:
|
74
|
+
- README
|
75
|
+
- lib/musicbrainz_automatcher.rb
|
76
|
+
has_rdoc: true
|
77
|
+
homepage: http://github.com/metade/musicbrainz_automatcher
|
78
|
+
licenses: []
|
79
|
+
|
80
|
+
post_install_message:
|
81
|
+
rdoc_options:
|
82
|
+
- --main
|
83
|
+
- README
|
84
|
+
require_paths:
|
85
|
+
- lib
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: "0"
|
91
|
+
version:
|
92
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: "0"
|
97
|
+
version:
|
98
|
+
requirements: []
|
99
|
+
|
100
|
+
rubyforge_project: musicbrainz_automatcher
|
101
|
+
rubygems_version: 1.3.5
|
102
|
+
signing_key:
|
103
|
+
specification_version: 3
|
104
|
+
summary: Matches artist and track names to MusicBrainz artists
|
105
|
+
test_files: []
|
106
|
+
|