metade-link_toad 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +48 -0
- data/examples/musicbrainz/guardian/README +13 -0
- data/examples/musicbrainz/guardian/guardian-music-news.rb +15 -0
- data/examples/musicbrainz/guardian/guardian-musicbrainz.rb +40 -0
- data/examples/musicbrainz/guardian/guardian-musicbrainz.yml +245 -0
- data/lib/link_toad.rb +61 -0
- data/lib/link_toad/music_link_toad.rb +10 -0
- data/spec/link_toad_spec.rb +81 -0
- data/spec/music_link_toad_spec.rb +28 -0
- data/spec/spec_helper.rb +2 -0
- metadata +74 -0
data/README
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= link_toad
|
2
|
+
|
3
|
+
* http://github.com/metade/link_toad
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
LinkToad is a general purpose equivalency engine that uses hyperlinks.
|
8
|
+
|
9
|
+
== USAGE:
|
10
|
+
|
11
|
+
require 'rubygems'
|
12
|
+
require 'link_toad'
|
13
|
+
|
14
|
+
# mapping of artist-related URLs to MusicBrainz artist identifier
|
15
|
+
mapping = {
|
16
|
+
'http://www.arcticmonkeys.com/' => 'ada7a83c-e3e1-40f1-93f9-3e73dbc9298a',
|
17
|
+
}
|
18
|
+
toad = LinkToad.new(mapping)
|
19
|
+
|
20
|
+
# match a news story linking to the Arctic Monkeys homepage
|
21
|
+
toad.match('http://news.bbc.co.uk/1/hi/entertainment/7664082.stm')
|
22
|
+
=> ["ada7a83c-e3e1-40f1-93f9-3e73dbc9298a"]
|
23
|
+
|
24
|
+
See +examples/+ for more details.
|
25
|
+
|
26
|
+
== LICENSE:
|
27
|
+
|
28
|
+
The MIT License
|
29
|
+
|
30
|
+
Copyright (c) 2008 Patrick Sinclair
|
31
|
+
|
32
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
33
|
+
of this software and associated documentation files (the "Software"), to deal
|
34
|
+
in the Software without restriction, including without limitation the rights
|
35
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
36
|
+
copies of the Software, and to permit persons to whom the Software is
|
37
|
+
furnished to do so, subject to the following conditions:
|
38
|
+
|
39
|
+
The above copyright notice and this permission notice shall be included in
|
40
|
+
all copies or substantial portions of the Software.
|
41
|
+
|
42
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
43
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
44
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
45
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
46
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
47
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
48
|
+
THE SOFTWARE.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
These scripts enable news items on the Guardian RSS feed to be associated with a particular MusicBrainz artist.
|
2
|
+
|
3
|
+
guardian-musicbrainz.rb
|
4
|
+
-----------------------
|
5
|
+
|
6
|
+
* pulls in the full list of music-related tags from the Guardian website
|
7
|
+
* for each sensible-looking tag it searches MusicBrainz for a matching artist
|
8
|
+
* it outputs a YAML file mapping Guardian tag URLs to artist GUIDs
|
9
|
+
|
10
|
+
guardian-music-news.rb
|
11
|
+
----------------------
|
12
|
+
|
13
|
+
Uses the mapping produced by +guardian-musicbrainz.rb+ to associate stories from the Guardian Music RSS feed with MusicBrainz artists.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rss/2.0'
|
3
|
+
require 'yaml'
|
4
|
+
require '../../../lib/link_toad.rb'
|
5
|
+
|
6
|
+
# Build the matcher from the tag-URL to artist-GUID mapping stored in the YAML file.
matcher = LinkToad.new(YAML.load_file('guardian-musicbrainz.yml'))

# For every story in the Guardian music feed, print the story URL followed by
# the identifiers that the links inside that story resolve to.
RSS::Parser.parse(open('http://www.guardian.co.uk/music/rss')).items.each do |story|
  story_url = story.guid.content
  puts "#{story_url}: #{matcher.match(story_url).inspect}"
end
|
15
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'rbrainz'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'yaml'
|
6
|
+
|
7
|
+
# Guardian tag pages that are not artists; any URL matching one of these
# patterns is skipped instead of being searched for on MusicBrainz.
ignore_list = [
  # guardian specific
  /alternativetop100albums/, /bobmarley60thanniversary/,
  # events
  /britawards/, /eurovision/, /glastonbury/, /live8/, /meltdownfestival/,
  /mercuryprize/, /o2wirelessweekenders/, /proms/, /readingandleedsfestival/,
  # genres
  /classicalmusicandopera/, /downloads/, /electronicmusic/, /folk/,
  /jazz/, /popandrock/, /worldmusic/
]

urls = [] # tag URLs already handled, so each tag is looked up only once
q = MusicBrainz::Webservice::Query.new

# Fetch the tag index before touching the output file, so a failed download
# does not truncate a mapping produced by an earlier run.
doc = Hpricot(open('http://www.guardian.co.uk/music/list/allmusickeywords'))

# Block form of File.open: the file is flushed and closed when the block
# exits, including when a lookup raises part-way through.
File.open('guardian-musicbrainz.yml', 'w') do |file|
  file.puts('---')

  doc.search('//a').each do |link|
    url = link.attributes['href']
    next if url.nil?
    next if urls.include? url
    # Only single-segment tag pages under /music/ are considered.
    next unless url =~ %r[http://www\.guardian\.co\.uk/music/(\w+)$]
    next if ignore_list.any? { |re| url =~ re }
    urls << url

    # The link text is used as the artist search query.
    name = link.inner_html.strip
    results = q.get_artists(MusicBrainz::Webservice::ArtistFilter.new(:query => name))
    sleep 1 # don't hit MusicBrainz too hard

    if results.size > 0
      # Take the first search result; its name is written as a trailing
      # YAML comment so the mapping can be checked by eye.
      m_name, gid = results[0].entity.name, results[0].entity.id.uuid
      file.puts("#{url}: #{gid} ##{m_name}")
    end
  end
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
---
|
2
|
+
http://www.guardian.co.uk/music/2pac: 382f1005-e9ab-4684-afd4-0bdae4ee37f2 #2Pac
|
3
|
+
http://www.guardian.co.uk/music/50cent: 6e3db4ff-39cc-4675-8310-5ace0cd63bfa #50 Cent & Eminem
|
4
|
+
http://www.guardian.co.uk/music/aaliyah: bc85da58-52d9-457d-ae8d-5d8d4ec870a9 #Aaliyah
|
5
|
+
http://www.guardian.co.uk/music/abba: d87e52c5-bb8d-4da8-b941-9f4928627dc8 #ABBA
|
6
|
+
http://www.guardian.co.uk/music/ryanadams: c80f38a6-9980-485d-997c-5c1a9cbd0d64 #Ryan Adams
|
7
|
+
http://www.guardian.co.uk/music/christinaaguilera: b202beb7-99bd-47e7-8b72-195c8d72ebdd #Christina Aguilera
|
8
|
+
http://www.guardian.co.uk/music/akon: 1138a764-2212-4d0a-b02d-0dc14df91e08 #Akon
|
9
|
+
http://www.guardian.co.uk/music/alltomorrowsparties: 1771f952-52c9-4de4-8c65-8ca95f932652 #All Tomorrow's Party
|
10
|
+
http://www.guardian.co.uk/music/lilyallen: 6e0c7c0e-cba5-4c2c-a652-38f71ef5785d #Lily Allen
|
11
|
+
http://www.guardian.co.uk/music/toriamos: c0b2500e-0cef-4130-869d-732b23ed9df5 #Tori Amos
|
12
|
+
http://www.guardian.co.uk/music/animalcollective: 0c751690-c784-4a4f-b1e4-c1de27d47581 #Animal Collective
|
13
|
+
http://www.guardian.co.uk/music/antonyandthejohnsons: 90cc2464-234e-4da0-b39b-576f36e633bc #Antony and the Johnsons
|
14
|
+
http://www.guardian.co.uk/music/fionaapple: a9ee533f-8871-4f62-a6bb-91eb264abc90 #Fiona Apple
|
15
|
+
http://www.guardian.co.uk/music/arcadefire: 52074ba6-e495-4ef3-9bb4-0703888a9f68 #Arcade Fire
|
16
|
+
http://www.guardian.co.uk/music/arcticmonkeys: ada7a83c-e3e1-40f1-93f9-3e73dbc9298a #Arctic Monkeys
|
17
|
+
http://www.guardian.co.uk/music/richardashcroft: 2d1d8985-47bc-4244-8cc3-577584e411f6 #Richard Ashcroft
|
18
|
+
http://www.guardian.co.uk/music/babyshambles: 8e1e03fe-ebbc-467a-b541-857144db10fb #Babyshambles
|
19
|
+
http://www.guardian.co.uk/music/jsbach: e2f2a608-9a8b-47b7-81d1-04af13632336 #JS
|
20
|
+
http://www.guardian.co.uk/music/badlydrawnboy: 0881daf1-20df-4a3e-a84f-6476a84bb172 #Badly Drawn Boy
|
21
|
+
http://www.guardian.co.uk/music/erykahbadu: 321531fc-db73-4ffa-a959-61a61a2908c1 #Erykah Badu
|
22
|
+
http://www.guardian.co.uk/music/sydbarrett: 12327d75-47d5-45d9-84c2-3760b9210c17 #Syd Barrett
|
23
|
+
http://www.guardian.co.uk/music/basementjaxx: 28cbf94d-0700-4095-a188-37e373b069a7 #Basement Jaxx
|
24
|
+
http://www.guardian.co.uk/music/batforlashes: 10000730-525f-4ed5-aaa8-92888f060f5f #Bat for Lashes
|
25
|
+
http://www.guardian.co.uk/music/battles: 8522b9b6-b295-48d7-9a10-8618fb80beb8 #Battles
|
26
|
+
http://www.guardian.co.uk/music/thebeatles: b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d #The Beatles
|
27
|
+
http://www.guardian.co.uk/music/beck: 0df890e1-f4f2-4b21-a413-cd8af1af32d8 #Christophe Beck
|
28
|
+
http://www.guardian.co.uk/music/belleandsebastian: e5c7b94f-e264-473c-bb0f-37c85d4d5c70 #Belle and Sebastian
|
29
|
+
http://www.guardian.co.uk/music/bjork: 87c5dedd-371d-4a53-9f7f-80522fb7f3cb #Björk
|
30
|
+
http://www.guardian.co.uk/music/blackflag: 9941a936-196a-4a62-ae53-a69cbc33f20e #Black Flag
|
31
|
+
http://www.guardian.co.uk/music/frankblack: 98a35a4d-a845-4282-bc7c-5679f2bb9bd2 #Frank Black
|
32
|
+
http://www.guardian.co.uk/music/blacksabbath: 5182c1d9-c7d2-4dad-afa0-ccfeada921a8 #Black Sabbath
|
33
|
+
http://www.guardian.co.uk/music/blocparty: 8c538f11-c141-4588-8ecb-931083524186 #Bloc Party
|
34
|
+
http://www.guardian.co.uk/music/blondie: 4d2956d1-a3f7-44bb-9a41-67563e1a0c94 #Blondie
|
35
|
+
http://www.guardian.co.uk/music/blur: ba853904-ae25-4ebb-89d6-c44cfbd71bd2 #Blur
|
36
|
+
http://www.guardian.co.uk/music/bonnieprincebilly: 7b0a63b2-b8e8-490e-b724-1d30cb5edfe3 #Bonnie 'Prince' Billy
|
37
|
+
http://www.guardian.co.uk/music/davidbowie: 5441c29d-3602-4898-b1a1-b77fa23b8e50 #David Bowie
|
38
|
+
http://www.guardian.co.uk/music/breeders: 667e856e-f3a7-42e9-8244-95422ef31321 #The Breeders
|
39
|
+
http://www.guardian.co.uk/music/foxybrown: c0235fec-bb90-4976-8da1-5d1b37e158ec #Foxy Brown & Jay-Z
|
40
|
+
http://www.guardian.co.uk/music/ianbrown: 182d5d8a-94c7-4ef4-8222-a1838353a37b #Ian Brown
|
41
|
+
http://www.guardian.co.uk/music/buzzcocks: 31e9c35b-2675-4632-8596-f9bd9286f6c8 #Buzzcocks
|
42
|
+
http://www.guardian.co.uk/music/camdencrawl: b1dffc4b-0b99-4037-a726-0a111e0ca812 #Camden
|
43
|
+
http://www.guardian.co.uk/music/mariahcarey: 494e8d09-f85b-4543-892f-a5096aed1cd4 #Mariah Carey
|
44
|
+
http://www.guardian.co.uk/music/johnnycash: d43d12a1-2dc9-4257-a2fd-0a3bb1081b86 #Johnny Cash
|
45
|
+
http://www.guardian.co.uk/music/charlatans: 8434409e-baa9-4e12-b4aa-566a91c7d7cf #The Charlatans
|
46
|
+
http://www.guardian.co.uk/music/raycharles: 2ce02909-598b-44ef-a456-151ba0a3bd70 #Ray Charles
|
47
|
+
http://www.guardian.co.uk/music/clapyourhandssayyeah: 4b2d6a23-034d-4a29-9bb9-d2462796da4e #Clap Your Hands Say Yeah
|
48
|
+
http://www.guardian.co.uk/music/ericclapton: 618b6900-0618-4f1e-b835-bccb17f84294 #Eric Clapton
|
49
|
+
http://www.guardian.co.uk/music/clash: 8f92558c-2baa-4758-8c38-615519e9deda #The Clash
|
50
|
+
http://www.guardian.co.uk/music/jimmycliff: 2caa54a7-b08c-41da-b892-3a41abe778be #Jimmy Cliff
|
51
|
+
http://www.guardian.co.uk/music/clinic: 1ea2c08f-323c-4207-9af1-e2fb8588f08e #Clinic
|
52
|
+
http://www.guardian.co.uk/music/georgeclinton: 7e22c4e9-1355-48d3-af58-03347e95b5f3 #George S. Clinton
|
53
|
+
http://www.guardian.co.uk/music/clubs: a3751fb8-b8cc-4b43-a1cb-3cb39a074402 #Clubbing
|
54
|
+
http://www.guardian.co.uk/music/coldplay: cc197bad-dc9c-440d-a5b5-d52ba2e14234 #Coldplay
|
55
|
+
http://www.guardian.co.uk/music/alicecooper: 4d7928cd-7ed2-4282-8c29-c0c9f966f1bd #Alice Cooper
|
56
|
+
http://www.guardian.co.uk/music/elviscostello: 8a338e06-d182-46f2-bd16-30a09bc840ba #Elvis Costello
|
57
|
+
http://www.guardian.co.uk/music/cribs: a3a92047-be1c-4f3e-8960-c4f8570984df #The Cribs
|
58
|
+
http://www.guardian.co.uk/music/crosbystillsnashandyoung: 46a782ea-4308-476b-abd1-a91b197f3037 #Crosby, Stills, Nash & Young
|
59
|
+
http://www.guardian.co.uk/music/cypresshill: 51508c1f-8d07-4a00-9cf1-26c570fe7b78 #Cypress Hill
|
60
|
+
http://www.guardian.co.uk/music/dangermouse: 4b356f05-bcc2-4544-925b-fd9a1bf708be #Danger Mouse
|
61
|
+
http://www.guardian.co.uk/music/raydavies: a0ecce6d-7dfd-4e3c-9d1e-26465244450d #Ray Davies & His Funky Trumpet
|
62
|
+
http://www.guardian.co.uk/music/deathcabforcutie: 0039c7ae-e1a7-4a7d-9b49-0cbc716821a6 #Death Cab for Cutie
|
63
|
+
http://www.guardian.co.uk/music/defleppard: 7249b899-8db8-43e7-9e6e-22f1e736024e #Def Leppard
|
64
|
+
http://www.guardian.co.uk/music/devo: 4d8a5bce-7f33-4fd2-bf8f-e8f6cf467373 #Devo Springsteen
|
65
|
+
http://www.guardian.co.uk/music/dinosaurjr: 77c167d2-4965-4421-830a-9815e4956475 #Dinosaur Jr.
|
66
|
+
http://www.guardian.co.uk/music/direstraits: 614e3804-7d34-41ba-857f-811bad7c2b7a #Dire Straits
|
67
|
+
http://www.guardian.co.uk/music/dirtyprettythings: 648615ca-ca74-460d-928a-2bae67ae6d14 #Dirty Pretty Things
|
68
|
+
http://www.guardian.co.uk/music/dizzeerascal: 1a99cc88-aea3-4fe3-96b9-20791667f65f #Dizzee Rascal
|
69
|
+
http://www.guardian.co.uk/music/petedoherty: ff041743-15eb-49db-933f-8bc66a4a3235 #Pete Doherty
|
70
|
+
http://www.guardian.co.uk/music/doors: 9efff43b-3b29-4082-824e-bc82f646f93d #The Doors
|
71
|
+
http://www.guardian.co.uk/music/drdre: 5f6ab597-f57a-40da-be9e-adad48708203 #Dr. Dre
|
72
|
+
http://www.guardian.co.uk/music/duranduran: 2f1baa8d-aad7-4bf5-b5f2-ec857e20129a #Andy Durán
|
73
|
+
http://www.guardian.co.uk/music/bobdylan: 72c536dc-7137-4477-a521-567eeb840fa8 #Bob Dylan
|
74
|
+
http://www.guardian.co.uk/music/eagles: f46bd570-5768-462e-b84c-c7c993bbf47e #Eagles
|
75
|
+
http://www.guardian.co.uk/music/echoandthebunnymen: ccd4879c-5e88-4385-b131-bf65296bf245 #Echo & The Bunnymen
|
76
|
+
http://www.guardian.co.uk/music/eels: 14387b0f-765c-4852-852f-135335790466 #Eels
|
77
|
+
http://www.guardian.co.uk/music/elbow: 3cb3928a-526c-4a3d-93c5-53315fa9bde0 #Elbow
|
78
|
+
http://www.guardian.co.uk/music/missyelliott: a0b8cb9e-7532-45fe-a74c-30e7c4009a39 #Missy Elliott
|
79
|
+
http://www.guardian.co.uk/music/elvis25yearson: 6bd50a1f-c6b6-49ab-97b5-87cc5430f16c #17 Years
|
80
|
+
http://www.guardian.co.uk/music/eminem: b95ce3ff-3d05-4e87-9e01-c97b66af13d4 #Eminem
|
81
|
+
http://www.guardian.co.uk/music/brianeno: ff95eb47-41c4-4f7f-a104-cdc30f02e872 #Brian Eno
|
82
|
+
http://www.guardian.co.uk/music/falloutboy: 516cef4d-0718-4007-9939-f9b38af3f784 #Fall Out Boy
|
83
|
+
http://www.guardian.co.uk/music/fall: d5da1841-9bc8-4813-9f89-11098090148e #The Fall
|
84
|
+
http://www.guardian.co.uk/music/bryanferry: 4ef7a9e2-2cf5-483a-8616-ef7791a98026 #Bryan Ferry
|
85
|
+
http://www.guardian.co.uk/music/foals: 6a65d878-fcd0-42cf-aff9-ca1d636a8bcc #Foals
|
86
|
+
http://www.guardian.co.uk/music/foofighters: 67f66c07-6e61-4026-ade5-7e782fad3a5d #Foo Fighters
|
87
|
+
http://www.guardian.co.uk/music/franzferdinand: aa7a2827-f74b-473c-bd79-03d065835cf7 #Franz Ferdinand
|
88
|
+
http://www.guardian.co.uk/music/futureheads: 0cf9d983-aecf-4d57-9e94-fde2beac014f #The Futureheads
|
89
|
+
http://www.guardian.co.uk/music/gallows: 4cb159df-bd09-4f42-bfec-09b6a248a52f #Gallows
|
90
|
+
http://www.guardian.co.uk/music/gangoffour: d8661c02-f423-4d72-8044-40ff05daf7a1 #Gang of Four
|
91
|
+
http://www.guardian.co.uk/music/georgeharrison: 42a8f507-8412-4611-854f-926571049fa0 #George Harrison
|
92
|
+
http://www.guardian.co.uk/music/georgemichael: ccb8f30e-4d71-40c4-8b1d-846dafe73e2c #George Michael
|
93
|
+
http://www.guardian.co.uk/music/girlsaloud: a0b2f210-cd3a-453d-937d-e4f2658d17c7 #Girls Aloud
|
94
|
+
http://www.guardian.co.uk/music/gnarlsbarkley: a47c3aa2-7d87-475c-a2c7-1e2047dafb09 #Gnarls Barkley
|
95
|
+
http://www.guardian.co.uk/music/gorillaz: e21857d5-3256-4547-afb3-4b6ded592596 #Gorillaz
|
96
|
+
http://www.guardian.co.uk/music/greenday: 084308bd-1654-436f-ba03-df6697104e19 #Green Day
|
97
|
+
http://www.guardian.co.uk/music/grinderman: 611f1230-7c2b-4610-b9e6-5a4fcb6e3cc7 #Grinderman
|
98
|
+
http://www.guardian.co.uk/music/groovearmada: 35723b60-732e-4bd8-957f-320b416e7b7f #Groove Armada
|
99
|
+
http://www.guardian.co.uk/music/gunsnroses: eeb1195b-f213-4ce1-b28c-8565211f8e43 #Guns N' Roses
|
100
|
+
http://www.guardian.co.uk/music/pjharvey: e795e03d-b5d5-4a5f-834d-162cfb308a2c #PJ Harvey
|
101
|
+
http://www.guardian.co.uk/music/isaachayes: a2361c7d-ddb6-41c1-a9a0-be09fbbb8d21 #Isaac Hayes
|
102
|
+
http://www.guardian.co.uk/music/icecube: b1274489-1832-4fd9-a7b6-9414d0b05f62 #Dr. Dre & Ice Cube
|
103
|
+
http://www.guardian.co.uk/music/indigogirls: 00c49f40-d715-4b79-b223-432048602cce #Indigo Girls
|
104
|
+
http://www.guardian.co.uk/music/inxs: 481bf5f9-2e7c-4c44-b08a-05b32bc7c00d #INXS
|
105
|
+
http://www.guardian.co.uk/music/boniver: 437a0e49-c6ae-42f6-a6c1-84f25ed366bc #Bon Iver
|
106
|
+
http://www.guardian.co.uk/music/michaeljackson: f27ec8db-af05-4f36-916e-3d57f91ecf5e #Michael Jackson
|
107
|
+
http://www.guardian.co.uk/music/jam: 23228f18-01d5-493e-94ce-cfcde82a8db2 #The Jam
|
108
|
+
http://www.guardian.co.uk/music/jayz: f82bcf78-5b69-4622-a5ef-73800768d9ac #Jay-Z
|
109
|
+
http://www.guardian.co.uk/music/jesusandmarychain: e938a15c-b17e-4e7a-9f68-ff0d536cab44 #The Jesus and Mary Chain
|
110
|
+
http://www.guardian.co.uk/music/johnfordham: fc3d0f8f-236e-4012-afc3-3002c4d2530c #Debra Fordham
|
111
|
+
http://www.guardian.co.uk/music/johnlennon: 4d5447d7-c61c-4120-ba1b-d7f471d385b9 #John Lennon
|
112
|
+
http://www.guardian.co.uk/music/joydivision: 9a58fda3-f4ed-4080-a3a5-f457aac9fcdd #Joy Division
|
113
|
+
http://www.guardian.co.uk/music/kaiserchiefs: 90218af4-4d58-4821-8d41-2ee295ebbe21 #Kaiser Chiefs
|
114
|
+
http://www.guardian.co.uk/music/kasabian: 69b39eab-6577-46a4-a9f5-817839092033 #Kasabian
|
115
|
+
http://www.guardian.co.uk/music/kelis: 1239d1bc-cc09-43e0-bcd0-374f60346138 #Kelis
|
116
|
+
http://www.guardian.co.uk/music/aliciakeys: e92aa52d-bb07-4ed7-bcca-2c3f7e93d863 #Usher & Alicia Keys
|
117
|
+
http://www.guardian.co.uk/music/rilokiley: eaf6a7ca-105d-4a94-ba02-8c3e4040319a #Rilo Kiley
|
118
|
+
http://www.guardian.co.uk/music/killers: 95e1ead9-4d31-4808-a7ac-32c3614c116b #The Killers
|
119
|
+
http://www.guardian.co.uk/music/kingsofconvenience: cf0f4547-ffbd-4011-98ad-0bec9ba022db #Kings of Convenience
|
120
|
+
http://www.guardian.co.uk/music/kingsofleon: 6ffb8ea9-2370-44d8-b678-e9237bbd347b #Kings of Leon
|
121
|
+
http://www.guardian.co.uk/music/kinks: 17b53d9f-5c63-4a09-a593-dde4608e0db9 #The Kinks
|
122
|
+
http://www.guardian.co.uk/music/kiss: 98b67ebc-5606-4cdb-9787-47b12cceb101 #Kiss
|
123
|
+
http://www.guardian.co.uk/music/alisonkrauss: 6b064ead-91a4-4ac8-8076-b1febe4f4aac #Alison Krauss
|
124
|
+
http://www.guardian.co.uk/music/ledzeppelin: 678d88b2-87b0-403b-b63d-5da7465aecc3 #Led Zeppelin
|
125
|
+
http://www.guardian.co.uk/music/johnlegend: 75a72702-a5ef-4513-bca5-c5b944903546 #John Legend
|
126
|
+
http://www.guardian.co.uk/music/leonalewis: 8d552dfc-648f-401f-90de-e925013ca537 #Leona Lewis
|
127
|
+
http://www.guardian.co.uk/music/libertines: 82b304c0-7da4-45d3-896a-0767c7ae1141 #The Libertines
|
128
|
+
http://www.guardian.co.uk/music/llcoolj: a4dd0e77-83b8-4e92-89b7-effb0e47fd8c #LL Cool J
|
129
|
+
http://www.guardian.co.uk/music/love: 34ec9a8d-c65b-48fd-bcdd-aad2f72fdb47 #Love
|
130
|
+
http://www.guardian.co.uk/music/courtney: 31d2041c-985d-48f7-b6e2-2a70cdf14853 #Courtney Love
|
131
|
+
http://www.guardian.co.uk/music/nicklowe: a3d5a6bf-c86d-44d3-969b-f345620009c6 #Nick Lowe
|
132
|
+
http://www.guardian.co.uk/music/luna: 107ed89a-88ff-4a90-8b75-2619dc7ba950 #Luna-C
|
133
|
+
http://www.guardian.co.uk/music/madonna: 79239441-bfd5-4981-a70c-55c3f15c1287 #Madonna
|
134
|
+
http://www.guardian.co.uk/music/bobmarley: c33627c6-ef0d-49de-9ef0-c4804190040f #Bob Marley
|
135
|
+
http://www.guardian.co.uk/music/johnnymarr: d6ee4ecc-ed0a-42f3-a3c8-d943bd63744f #Johnny Marr + The Healers
|
136
|
+
http://www.guardian.co.uk/music/maximopark: 92e634a7-6023-4be8-be15-ebba822f5b34 #Maxïmo Park
|
137
|
+
http://www.guardian.co.uk/music/paulmccartney: cd7a47b2-bdcb-413f-a939-7d8d55bd6fc3 #Paul McCartney & Linda McCartney
|
138
|
+
http://www.guardian.co.uk/music/metallica: 65f4f0c5-ef9e-490c-aee3-909e7ae6b2ab #Metallica
|
139
|
+
http://www.guardian.co.uk/music/mia: 5f6be871-eb98-42f1-bce4-5a3d8212c281 #M.I.A.
|
140
|
+
http://www.guardian.co.uk/music/michaeljacksontrial: f27ec8db-af05-4f36-916e-3d57f91ecf5e #Michael Jackson
|
141
|
+
http://www.guardian.co.uk/music/kylieminogue: 2fddb92d-24b2-46a5-bf28-3aed46f4684c #Kylie Minogue
|
142
|
+
http://www.guardian.co.uk/music/jonimitchell: a6de8ef9-b1a1-4756-97aa-481bbb8a4069 #Joni Mitchell
|
143
|
+
http://www.guardian.co.uk/music/modestmouse: a96ac800-bfcb-412a-8a63-0a98df600700 #Modest Mouse
|
144
|
+
http://www.guardian.co.uk/music/vanmorrison: a41ac10f-0a56-4672-9161-b83f9b223559 #Van Morrison
|
145
|
+
http://www.guardian.co.uk/music/morrissey: 013fa897-86db-41d3-8e9f-386c8a34f4e6 #Morrissey
|
146
|
+
http://www.guardian.co.uk/music/mozart: b972f589-fb0e-474e-b64a-803b0364fa75 #Wolfgang Amadeus Mozart
|
147
|
+
http://www.guardian.co.uk/music/mudhoney: e675295a-1efe-4247-aa3b-53b78d0cdffc #Mudhoney
|
148
|
+
http://www.guardian.co.uk/music/muse: 9c9f1380-2516-4fc9-a3e6-f9f61941d090 #Muse
|
149
|
+
http://www.guardian.co.uk/music/netmusic: a10ffaf2-0d3f-4d89-bc42-914c36e86f02 #Internet
|
150
|
+
http://www.guardian.co.uk/music/roxymusic: 331ce348-1b08-40b9-8ed7-0763b92bd003 #Roxy Music
|
151
|
+
http://www.guardian.co.uk/music/mychemicalromance: c07f0676-9143-4217-8a9f-4c26bd636f13 #My Chemical Romance
|
152
|
+
http://www.guardian.co.uk/music/neptunes: cc85e0b6-e953-4602-be9c-8c4218e417de #Neptunes
|
153
|
+
http://www.guardian.co.uk/music/nerd: 3fb49f5a-fdc0-4789-9c84-22b38b3f3cb5 #N.E.R.D.
|
154
|
+
http://www.guardian.co.uk/music/neworder: af1995e4-16a1-4c05-9ac9-082edeb3a099 #Kylie Minogue vs. New Order
|
155
|
+
http://www.guardian.co.uk/music/stevienicks: 4b1bf05d-0e3c-44a3-9fc8-ad088fc25797 #Buckingham Nicks
|
156
|
+
http://www.guardian.co.uk/music/nineinchnails: b7ffd2af-418f-4be2-bdd1-22f8b48613da #Nine Inch Nails
|
157
|
+
http://www.guardian.co.uk/music/nirvana: 5b11f4ce-a62d-471e-81fc-a69a8278c7da #Nirvana
|
158
|
+
http://www.guardian.co.uk/music/nodoubt: fbd2a255-1d57-4d31-ac11-65b671c19958 #No Doubt
|
159
|
+
http://www.guardian.co.uk/music/garynuman: 6cb79cb2-9087-44d4-828b-5c6fdff2c957 #Gary Numan
|
160
|
+
http://www.guardian.co.uk/music/nwa: 3a54bffa-2314-44a2-927b-60144119c780 #N.W.A
|
161
|
+
http://www.guardian.co.uk/music/oasis: 39ab1aed-75e0-4140-bd47-540276886b60 #Oasis
|
162
|
+
http://www.guardian.co.uk/music/oldirtybastard: d50548a0-3cfd-4d7a-964b-0aef6545d819 #Ol' Dirty Bastard
|
163
|
+
http://www.guardian.co.uk/music/orbital: f3e2a7d9-c6bb-4848-95e5-04c0a1e2f511 #Orbital
|
164
|
+
http://www.guardian.co.uk/music/ozzyosbourne: 8aa5b65a-5b3c-4029-92bf-47a544356934 #Ozzy Osbourne
|
165
|
+
http://www.guardian.co.uk/music/outkast: 73fdb566-a9b1-494c-9f32-51768ec9fd27 #OutKast
|
166
|
+
http://www.guardian.co.uk/music/panicatthedisco: b9472588-93f3-4922-a1a2-74082cdf9ce8 #Panic at the Disco
|
167
|
+
http://www.guardian.co.uk/music/gramparsons: cbc83fb1-2c23-4ad1-9187-594b30da3f21 #Gram Parsons
|
168
|
+
http://www.guardian.co.uk/music/peaches: 270acfee-1fbe-413e-a0a8-6a35a8b3b66c #Peaches
|
169
|
+
http://www.guardian.co.uk/music/pearljam: 83b9cbe7-9857-49e2-ab8e-b57b01038103 #Pearl Jam
|
170
|
+
http://www.guardian.co.uk/music/johnpeel: 65ddc5dd-6e1c-4f70-bee3-b67703bbf4c8 #John Peel
|
171
|
+
http://www.guardian.co.uk/music/petshopboys: be540c02-7898-4b79-9acc-c8122c7d9e83 #Pet Shop Boys
|
172
|
+
http://www.guardian.co.uk/music/tompetty: f93dbc64-6f08-4033-bcc7-8a0bb4689849 #Tom Petty and The Heartbreakers
|
173
|
+
http://www.guardian.co.uk/music/pinkfloyd: 83d91898-7763-47d7-b03b-b92132375c47 #Pink Floyd
|
174
|
+
http://www.guardian.co.uk/music/pixies: b6b2bb8d-54a9-491f-9607-7b546023b433 #Pixies
|
175
|
+
http://www.guardian.co.uk/music/iggypop: f37b3f31-b1f8-4b88-8cb5-b34f709b17d7 #Iggy Pop
|
176
|
+
http://www.guardian.co.uk/music/portishead: 8f6bd1e4-fbe1-4f50-aa9b-94c450ec0f11 #Portishead
|
177
|
+
http://www.guardian.co.uk/music/elvispresley: 01809552-4f87-45b0-afff-2c6f0730a3be #Elvis Presley
|
178
|
+
http://www.guardian.co.uk/music/primalscream: 55704c38-224f-4b75-b29f-d43653f8bc9a #Primal Scream
|
179
|
+
http://www.guardian.co.uk/music/prince: 070d193a-845c-479f-980e-bef15710653e #Prince
|
180
|
+
http://www.guardian.co.uk/music/prodigy: 4a4ee089-93b1-4470-af9a-6ff575d32704 #The Prodigy
|
181
|
+
http://www.guardian.co.uk/music/pulp: 76b2e842-5e85-4c97-ab62-d5bc315595b5 #Pulp
|
182
|
+
http://www.guardian.co.uk/music/queensofthestoneage: 7dc8f5bd-9d0b-4087-9f73-dc164950bbd8 #Queens of the Stone Age
|
183
|
+
http://www.guardian.co.uk/music/raconteurs: be407b02-f3e6-4ed5-9489-f8e5f0ab36dc #The Raconteurs
|
184
|
+
http://www.guardian.co.uk/music/radiohead: a74b1b7f-71a5-4011-9441-d0b5e4122711 #Radiohead
|
185
|
+
http://www.guardian.co.uk/music/ramones: d6ed7887-a401-47a8-893c-34b967444d26 #Ramones
|
186
|
+
http://www.guardian.co.uk/music/razorlight: f2cb0435-d643-4fab-9587-fdb0279330a7 #Razorlight
|
187
|
+
http://www.guardian.co.uk/music/redhotchilipeppers: 8bfac288-ccc5-448d-9573-c33ea2aa5c30 #Red Hot Chili Peppers
|
188
|
+
http://www.guardian.co.uk/music/keithrichards: f0ed72a3-ae8f-4cf7-b51d-2696a2330230 #Keith Richards
|
189
|
+
http://www.guardian.co.uk/music/santogold: d7311646-287b-4d3a-9a4f-7d46f93075e5 #Santogold
|
190
|
+
http://www.guardian.co.uk/music/sexpistols: e5db18cb-4b1f-496d-a308-548b611090d3 #Sex Pistols
|
191
|
+
http://www.guardian.co.uk/music/simonandgarfunkel: 5d02f264-e225-41ff-83f7-d9b1f0b1874a #Simon & Garfunkel
|
192
|
+
http://www.guardian.co.uk/music/siouxsieandthebanshees: 78ea5ea1-3c4d-4b7e-ac5d-68900319ebe2 #Siouxsie and The Banshees
|
193
|
+
http://www.guardian.co.uk/music/smashingpumpkins: ba0d6274-db14-4ef5-b28d-657ebde1a396 #The Smashing Pumpkins
|
194
|
+
http://www.guardian.co.uk/music/elliottsmith: 03ad1736-b7c9-412a-b442-82536d63a5c4 #Elliott Smith
|
195
|
+
http://www.guardian.co.uk/music/smiths: 40f5d9e4-2de7-4f2d-ad41-e31a9a9fea27 #The Smiths
|
196
|
+
http://www.guardian.co.uk/music/snoopdogg: f90e8b26-9e52-4669-a5c9-e28529c47894 #Snoop Dogg
|
197
|
+
http://www.guardian.co.uk/music/samsparro: cd71e6e9-42bb-4a1a-b5ce-17f41682b3e2 #Sam Sparro
|
198
|
+
http://www.guardian.co.uk/music/britneyspears: 45a663b5-b1cb-4a91-bff6-2bef7bbfdd76 #Britney Spears
|
199
|
+
http://www.guardian.co.uk/music/spicegirls: bf0caafc-2b20-4e07-ab85-87e14ff430ce #Spice Girls
|
200
|
+
http://www.guardian.co.uk/music/spiritualized: 65041e06-83d2-4987-ae52-c17a915fc82a #Spiritualized
|
201
|
+
http://www.guardian.co.uk/music/springsteen: 70248960-cb53-4ea4-943a-edb18f7d336f #Bruce Springsteen
|
202
|
+
http://www.guardian.co.uk/music/gwenstefani: 2e41ae9c-afd2-4f20-8f1e-17281ce9b472 #Gwen Stefani
|
203
|
+
http://www.guardian.co.uk/music/sufjanstevens: 01d3c51b-9b98-418a-8d8e-37f6fab59d8c #Sufjan Stevens
|
204
|
+
http://www.guardian.co.uk/music/davestewart: 4f8df6e2-33dc-4d05-86d9-2f9641c6f4d7 #Dave Stewart & Barbara Gaskin
|
205
|
+
http://www.guardian.co.uk/music/rodstewart: a35237a0-4f47-40a6-b6f3-1e786db23402 #Rod Stewart
|
206
|
+
http://www.guardian.co.uk/music/stoneroses: b5fa29f1-6c22-4321-a488-b5f363b06b06 #The Stone Roses
|
207
|
+
http://www.guardian.co.uk/music/strokes: 16aacd08-a0f3-46c1-b7ec-f1736f5de60d #The Diff'rent Strokes
|
208
|
+
http://www.guardian.co.uk/music/joestrummer: 39c1e474-647e-42ef-a157-fcfb30c2c2ff #Joe Strummer & The Mescaleros
|
209
|
+
http://www.guardian.co.uk/music/supergrass: 6386ddff-0d13-4685-9f0a-a82bf022fb1c #Supergrass
|
210
|
+
http://www.guardian.co.uk/music/hives: 487bfd74-71bf-46dd-b89c-80b7a0f06f2f #The Hives
|
211
|
+
http://www.guardian.co.uk/music/therollingstones: b071f9fa-14b0-4217-8e97-eb41da73f598 #The Rolling Stones
|
212
|
+
http://www.guardian.co.uk/music/thespecials: 07eb40a2-2914-439c-a01d-15a685b84ddf #The Specials
|
213
|
+
http://www.guardian.co.uk/music/thestreets: 0345b1d2-9017-4a97-848e-d5f7d2ea8de6 #One Way Streets
|
214
|
+
http://www.guardian.co.uk/music/who: 9fdaa16b-a6c4-4831-b87c-bc9ca8ce7eaa #The Who
|
215
|
+
http://www.guardian.co.uk/music/timbaland: daa09819-5da5-4c7a-8bef-eb372bb27ff1 #Timbaland
|
216
|
+
http://www.guardian.co.uk/music/justintimberlake: 596ffa74-3d08-44ef-b113-765d43d12738 #Justin Timberlake
|
217
|
+
http://www.guardian.co.uk/music/petetownshend: fb147b8f-0144-4418-acaa-90b2d9779840 #Pete Townshend
|
218
|
+
http://www.guardian.co.uk/music/kttunstall: 951d2103-9c7d-4849-ae60-88bf6aa4790b #KT Tunstall
|
219
|
+
http://www.guardian.co.uk/music/u2: a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432 #U2
|
220
|
+
http://www.guardian.co.uk/music/ub40: 7113aab7-628f-4050-ae49-dbecac110ca8 #UB40
|
221
|
+
http://www.guardian.co.uk/music/urban: ca738bcc-b2ce-4dcc-af52-d1654bfd4733 #Urban Cowboyz
|
222
|
+
http://www.guardian.co.uk/music/velvetrevolver: 3a528006-1429-47f4-ae9b-2ea95343e16a #Velvet Revolver
|
223
|
+
http://www.guardian.co.uk/music/velvetunderground: 94b0fb9d-a066-4823-b2ec-af1d324bcfcf #The Velvet Underground
|
224
|
+
http://www.guardian.co.uk/music/verve: d4d17620-fd97-4574-92a8-a2cb7e72ce42 #The Verve
|
225
|
+
http://www.guardian.co.uk/music/vines: 4e045c96-538b-46ed-8ea8-7cae20b56574 #The Vines
|
226
|
+
http://www.guardian.co.uk/music/marthawainwright: 231475d8-fddb-4d7d-aad9-287e59e4b4ba #Martha Wainwright
|
227
|
+
http://www.guardian.co.uk/music/rufuswainwright: 78e46ae5-9bfd-433b-be3f-19e993d67ecc #Rufus Wainwright
|
228
|
+
http://www.guardian.co.uk/music/mward: 655b3e5b-09e4-45dd-941c-6fa3fc12521b #M. Ward
|
229
|
+
http://www.guardian.co.uk/music/werejammin: 7fa1acb8-4fb9-4947-9040-51a844969834 #Jammin'
|
230
|
+
http://www.guardian.co.uk/music/weezer: 6fe07aa5-fec0-4eca-a456-f29bff451b04 #Weezer
|
231
|
+
http://www.guardian.co.uk/music/paulweller: ac1749b5-088e-4c42-9c39-7f578ff54f6e #Paul Weller vs. Portishead
|
232
|
+
http://www.guardian.co.uk/music/kanyewest: 164f0d73-1234-4e2c-8743-d77bf2191051 #Kanye West
|
233
|
+
http://www.guardian.co.uk/music/thewhitestripes: 11ae9fbb-f3d7-4a47-936f-4c0a04d3b3b5 #The White Stripes
|
234
|
+
http://www.guardian.co.uk/music/wilco: 9e53f84d-ef44-4c16-9677-5fd4d78cbd7d #Wilco
|
235
|
+
http://www.guardian.co.uk/music/pharrellwilliams: 149f91ef-1287-46da-9a8e-87fee02f1471 #Pharrell Williams
|
236
|
+
http://www.guardian.co.uk/music/brianwilson: 9b07fae3-4442-4c40-a9e0-78d3e0540901 #Brian Wilson & Van Dyke Parks
|
237
|
+
http://www.guardian.co.uk/music/tonywilson: c90528f0-75e7-435f-82e8-dfbdcf8824d3 #Tony Wilson
|
238
|
+
http://www.guardian.co.uk/music/amywinehouse: dfe9a7c4-8cf2-47f4-9dcb-d233c2b86ec3 #Amy Winehouse
|
239
|
+
http://www.guardian.co.uk/music/patrickwolf: 4ac4e32b-bd18-402e-adad-ae00e72f8d85 #Patrick Wolf
|
240
|
+
http://www.guardian.co.uk/music/steviewonder: 1ee18fb3-18a6-4c7f-8ba0-bc41cdd0462e #Stevie Wonder
|
241
|
+
http://www.guardian.co.uk/music/wutangclan: 0febdcf7-4e1f-4661-9493-b40427de2c13 #Wu-Tang Clan
|
242
|
+
http://www.guardian.co.uk/music/xtc: 97c86b2c-2765-46a2-aef8-76a7e24c430f #XTC
|
243
|
+
http://www.guardian.co.uk/music/yeahyeahyeahs: 584c04d2-4acc-491b-8a0a-e63133f4bfc4 #Yeah Yeah Yeahs
|
244
|
+
http://www.guardian.co.uk/music/neilyoung: 0f3515b0-75c9-46c9-b26c-4cd05d26eae7 #Neil Young & Crazy Horse
|
245
|
+
http://www.guardian.co.uk/music/thezutons: 6290b769-173d-49d1-990e-660a4e333877 #The Zutons
|
data/lib/link_toad.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
$: << File.expand_path(File.dirname(__FILE__))
|
7
|
+
|
8
|
+
# Associates a web document with identifiers by looking up the document's
# outgoing links in a URL => identifier hash.
class LinkToad
  attr_reader :mapping

  # +mapping+ is a hash mapping from a URL to an identifier
  # that should be associated with that URL. Values are flattened when
  # looked up, so a value may also be an array of identifiers.
  def initialize(mapping)
    @mapping = mapping
  end

  # Returns the identifiers for the document at the given +url+.
  #
  # Identifiers are found by looking up links in the document in the +mapping+ hash.
  # The result is a flat array without duplicates (empty when nothing matches).
  def match(url)
    links_from_url(url).map { |link| hits_for_uri(link) }.flatten.uniq
  end

  protected

  # Fetches the document at +url+ and returns the distinct absolute
  # http/https link targets of its <a> elements, as strings.
  #
  # Anchors without an href, hrefs that URI.parse rejects, and hrefs that
  # are not HTTP(S) URIs (which includes relative links) are skipped.
  def links_from_url(url)
    # NOTE(review): this relies on open-uri making Kernel#open fetch URLs, and
    # the spec stubs +open+ on the instance, so the call is left as is.
    # Ruby 3.0 removed URL support from Kernel#open; there URI.open is needed.
    # Kernel#open also treats a leading "|" as a command - only pass trusted URLs.
    doc = Hpricot(open(url))
    links = []
    doc.search('//a').each do |link|
      href = link.attributes['href']
      next if href.nil?
      begin
        uri = URI.parse(href.strip)
      rescue URI::InvalidURIError
        next
      end
      # URI::HTTPS is a subclass of URI::HTTP, so this accepts both schemes.
      next unless uri.kind_of?(URI::HTTP)
      links << uri.to_s
    end
    links.uniq
  end

  # Returns the identifiers that +mapping+ holds for +uri+ (a String) or for
  # a close variant of it: with/without a trailing slash, with/without a
  # leading "www." on the host, and with a trailing "/index.*" file removed.
  def hits_for_uri(uri)
    uri_string = uri.chomp('/')
    candidates = url_variants(uri_string)

    # try also without the index.* if it has one; anchored so that only the
    # final path segment is removed and names like "myindex.html" are left alone
    index_file = %r[/index\.\w{3,4}\z]i
    candidates.concat(url_variants(uri_string.sub(index_file, ''))) if uri_string =~ index_file

    candidates.uniq.map { |u| @mapping[u] }.flatten.compact.uniq
  end

  # Lookup keys for +base+ (a URL without trailing slash): the URL itself and
  # the same URL with the host's leading "www." toggled, each both without
  # and with a trailing slash.
  def url_variants(base)
    # Anchored to the start so that URLs embedded in a query string are not rewritten.
    toggled =
      if base =~ %r[\Ahttps?://www\.]
        base.sub(%r[\A(https?://)www\.], '\1')
      else
        base.sub(%r[\A(https?://)], '\1www.')
      end
    [base, "#{base}/", toggled, "#{toggled}/"]
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# A LinkToad that also recognises links pointing directly at a MusicBrainz
# or BBC Music artist page and returns the artist GID embedded in such a URL.
class MusicArtistsLinkToad < LinkToad
  # A GID in 8-4-4-4-12 lowercase hexadecimal form (36 characters).
  GID = '[a-f0-9]{8}(?:-[a-f0-9]{4}){3}-[a-f0-9]{12}'

  # Artist page URLs that carry the GID in their path (captured as group 1).
  # Dots in the host names are escaped so they only match a literal ".".
  ARTIST_URL_PATTERNS = [
    %r{https?://(?:www\.)?musicbrainz\.org/artist/(#{GID})},
    %r{https?://(?:www\.)?bbc\.co\.uk/music/artists/(#{GID})}
  ].freeze

  protected

  # Returns the GID found in +uri+ when it is a MusicBrainz or BBC Music
  # artist URL (http or https, with or without "www."); otherwise falls back
  # to the mapping lookup of LinkToad#hits_for_uri.
  def hits_for_uri(uri)
    ARTIST_URL_PATTERNS.each do |pattern|
      gid = uri[pattern, 1]
      return [gid] if gid
    end
    super
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
|
3
|
+
# Specs for LinkToad: link extraction from a fetched document and the
# URL-variant lookup performed by hits_for_uri.
describe LinkToad do
  before(:each) do
    @toad = LinkToad.new({})
  end

  describe "extracting links" do
    describe "from an empty doc" do
      before(:each) do
        # stub the fetch so no network access is needed
        @toad.expects(:open).once.returns('')
      end

      it "should return an empty array of links" do
        @toad.send(:links_from_url, 'http://www.foo.com').should == []
      end
    end

    describe "from a page with one link" do
      before(:each) do
        @toad.expects(:open).once.returns('<a href="http://www.foo.com">foo</a>')
      end

      it "should return that link back" do
        @toad.send(:links_from_url, 'http://www.foo.com').should == [ 'http://www.foo.com' ]
      end
    end
  end

  describe "looking up gids for a uri" do
    before(:each) do
      @toad.mapping.merge!({
        'http://www.coldplay.com/' => 'cc197bad-dc9c-440d-a5b5-d52ba2e14234',
        'http://www.keanemusic.com/' => 'c7020c6d-cae9-4db3-92a7-e5c561cbad50',
        'http://www.gymclassheroes.com/' => 'f4d4b515-0b74-423f-a161-db184330c37c',
        'http://www.madonna.com/' => '79239441-bfd5-4981-a70c-55c3f15c1287',
        'http://www.oasisinet.com/' => '39ab1aed-75e0-4140-bd47-540276886b60',
        'http://adele.tv/' => 'cc2c9c3c-b7bc-4b8b-84d8-4fbd8779e493',
      })
    end

    it "should return Coldplay's GID with an exact URL match" do
      @toad.send(:hits_for_uri, 'http://www.coldplay.com/').should == ['cc197bad-dc9c-440d-a5b5-d52ba2e14234']
    end

    it "should return Coldplay's GID without the trailing slash" do
      @toad.send(:hits_for_uri, 'http://www.coldplay.com').should == ['cc197bad-dc9c-440d-a5b5-d52ba2e14234']
    end

    it "should return Coldplay's GID with an index.php URL" do
      @toad.send(:hits_for_uri, 'http://www.coldplay.com/index.php').should == ['cc197bad-dc9c-440d-a5b5-d52ba2e14234']
    end

    it "should return Oasis's GID with an index.aspx URL" do
      # the index page match is case-insensitive
      @toad.send(:hits_for_uri, 'http://www.oasisinet.com/index.aspx').should == ['39ab1aed-75e0-4140-bd47-540276886b60']
      @toad.send(:hits_for_uri, 'http://www.oasisinet.com/Index.aspx').should == ['39ab1aed-75e0-4140-bd47-540276886b60']
    end

    it "should return Madonna's GID with an exact URL match" do
      @toad.send(:hits_for_uri, 'http://www.madonna.com/').should == ['79239441-bfd5-4981-a70c-55c3f15c1287']
    end

    it "should return Madonna's GID with URL omitting www." do
      @toad.send(:hits_for_uri, 'http://madonna.com').should == ['79239441-bfd5-4981-a70c-55c3f15c1287']
      @toad.send(:hits_for_uri, 'http://madonna.com/').should == ['79239441-bfd5-4981-a70c-55c3f15c1287']
    end

    it "should return Adele's GID with an exact URL match" do
      @toad.send(:hits_for_uri, 'http://adele.tv/').should == ['cc2c9c3c-b7bc-4b8b-84d8-4fbd8779e493']
    end

    # the mapping holds the www-less URL here, so the looked-up URL adds "www."
    it "should return Adele's GID with URL adding www." do
      @toad.send(:hits_for_uri, 'http://www.adele.tv').should == ['cc2c9c3c-b7bc-4b8b-84d8-4fbd8779e493']
      @toad.send(:hits_for_uri, 'http://www.adele.tv/').should == ['cc2c9c3c-b7bc-4b8b-84d8-4fbd8779e493']
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
|
3
|
+
require 'link_toad/music_link_toad'
|
4
|
+
|
5
|
+
# Specs for MusicArtistsLinkToad: artist GIDs are read straight out of
# BBC Music and MusicBrainz artist URLs, without consulting the mapping.
describe MusicArtistsLinkToad do
  before(:each) do
    @toad = MusicArtistsLinkToad.new({})
  end

  describe "looking up gids for a uri" do

    it "should match a BBC Artist URL" do
      # with/without "www." and with/without a trailing slash
      bbc_urls = [
        'http://bbc.co.uk/music/artists/cc197bad-dc9c-440d-a5b5-d52ba2e14234',
        'http://bbc.co.uk/music/artists/cc197bad-dc9c-440d-a5b5-d52ba2e14234/',
        'http://www.bbc.co.uk/music/artists/cc197bad-dc9c-440d-a5b5-d52ba2e14234',
        'http://www.bbc.co.uk/music/artists/cc197bad-dc9c-440d-a5b5-d52ba2e14234/',
      ]
      bbc_urls.each do |artist_url|
        found = @toad.send(:hits_for_uri, artist_url)
        found.should == [ 'cc197bad-dc9c-440d-a5b5-d52ba2e14234' ]
      end
    end

    it "should match a MusicBrainz URL" do
      # with/without "www." and with/without a trailing slash
      musicbrainz_urls = [
        'http://musicbrainz.org/artist/cc197bad-dc9c-440d-a5b5-d52ba2e14234',
        'http://musicbrainz.org/artist/cc197bad-dc9c-440d-a5b5-d52ba2e14234/',
        'http://www.musicbrainz.org/artist/cc197bad-dc9c-440d-a5b5-d52ba2e14234',
        'http://www.musicbrainz.org/artist/cc197bad-dc9c-440d-a5b5-d52ba2e14234/',
      ]
      musicbrainz_urls.each do |artist_url|
        found = @toad.send(:hits_for_uri, artist_url)
        found.should == [ 'cc197bad-dc9c-440d-a5b5-d52ba2e14234' ]
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: metade-link_toad
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Patrick Sinclair
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-10-11 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">"
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 0.0.0
|
23
|
+
version:
|
24
|
+
description: LinkToad is a general purpose equivalency engine that uses hyperlinks.
|
25
|
+
email: metade@gmail.com
|
26
|
+
executables: []
|
27
|
+
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files:
|
31
|
+
- README
|
32
|
+
files:
|
33
|
+
- README
|
34
|
+
- lib/link_toad.rb
|
35
|
+
- lib/link_toad/music_link_toad.rb
|
36
|
+
- examples/musicbrainz/guardian/README
|
37
|
+
- examples/musicbrainz/guardian/guardian-music-news.rb
|
38
|
+
- examples/musicbrainz/guardian/guardian-musicbrainz.rb
|
39
|
+
- examples/musicbrainz/guardian/guardian-musicbrainz.yml
|
40
|
+
has_rdoc: true
|
41
|
+
homepage: http://github.com/metade/link_toad
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options:
|
44
|
+
- --main
|
45
|
+
- README
|
46
|
+
- -x
|
47
|
+
- example
|
48
|
+
- -x
|
49
|
+
- spec
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "0"
|
57
|
+
version:
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: "0"
|
63
|
+
version:
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.2.0
|
68
|
+
signing_key:
|
69
|
+
specification_version: 2
|
70
|
+
summary: "LinkToad: hyperlink-powered equivalency engine."
|
71
|
+
test_files:
|
72
|
+
- spec/spec_helper.rb
|
73
|
+
- spec/link_toad_spec.rb
|
74
|
+
- spec/music_link_toad_spec.rb
|