music_story 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +13 -0
- data/lib/music_story.rb +25 -0
- data/lib/music_story/model/artist.rb +56 -0
- data/lib/music_story/model/batch.rb +21 -0
- data/lib/music_story/model/genre.rb +7 -0
- data/lib/music_story/repository/artist_sequel.rb +35 -0
- data/lib/music_story/repository/artist_xml_file.rb +113 -0
- data/lib/music_story/repository/batch_sftp.rb +178 -0
- data/lib/music_story/repository/genre_sequel.rb +7 -0
- data/lib/music_story/repository/sequel.rb +66 -0
- data/lib/music_story/utils/html_to_text.rb +30 -0
- data/lib/music_story/utils/xml_to_db_importer.rb +43 -0
- data/lib/music_story/version.rb +4 -0
- metadata +197 -0
data/README.txt
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Wrapper code for MusicStory data products.
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
|
5
|
+
MusicStory::Repository::ArtistXMLFile.new_with_open_file('music-story-data.xml') do |repo|
  repo.each do |artist|
    puts artist.name
    puts artist.plain_text_bio
    puts artist.main_genres[0].id
    # see MusicStory::Model::{Artist,Genre} for more available properties
  end
end
|
11
|
+
|
12
|
+
Should be able to cope with big XML files, as it uses an XML::Reader to scan through
|
13
|
+
the file one artist at a time; only the current artist object is kept in memory. (Although only tried it on a 330KB file so far so YMMV...)
|
data/lib/music_story.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module MusicStory
|
2
|
+
module Model; end
|
3
|
+
module Repository; end
|
4
|
+
module Utils; end
|
5
|
+
end
|
6
|
+
|
7
|
+
# alias it
|
8
|
+
Musicstory = MusicStory
|
9
|
+
|
10
|
+
require 'nokogiri'
|
11
|
+
require 'music_story/utils/html_to_text'
|
12
|
+
require 'music_story/utils/xml_to_db_importer'
|
13
|
+
|
14
|
+
require 'thin_models/struct/identity'
|
15
|
+
require 'music_story/model/artist'
|
16
|
+
require 'music_story/model/genre'
|
17
|
+
|
18
|
+
require 'music_story/repository/artist_xml_file'
|
19
|
+
|
20
|
+
require 'sequel'
|
21
|
+
require 'persistence'
|
22
|
+
require 'persistence/sequel'
|
23
|
+
require 'music_story/repository/artist_sequel'
|
24
|
+
require 'music_story/repository/genre_sequel'
|
25
|
+
require 'music_story/repository/sequel'
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module MusicStory
|
2
|
+
class Model::Artist < ThinModels::Struct
|
3
|
+
identity_attribute :id # MusicStory identifier
|
4
|
+
attribute :name # 'nom' at source
|
5
|
+
attribute :forename # 'prenom' at source
|
6
|
+
attribute :real_name # 'nom_reel' at source
|
7
|
+
attribute :role
|
8
|
+
attribute :type
|
9
|
+
attribute :country # 'pays' at source
|
10
|
+
attribute :image_filename
|
11
|
+
|
12
|
+
# Called 'resume' and 'texte_bio' in the MusicStory XML.
|
13
|
+
# Not sure what the appropriate translation for resume vs texte_bio is here,
|
14
|
+
# but in data seen so far they are both the same except that texte_bio has a
|
15
|
+
# credit/copyright line added at the end. Both are given as html, not plain
|
16
|
+
# text. (But see plain_text_{bio,summary})
|
17
|
+
attribute :summary_html
|
18
|
+
attribute :bio_html
|
19
|
+
|
20
|
+
attribute :main_genres
|
21
|
+
attribute :secondary_genres
|
22
|
+
attribute :influenced_by_genres
|
23
|
+
|
24
|
+
def all_genres
|
25
|
+
(main_genres + secondary_genres + influenced_by_genres).uniq
|
26
|
+
end
|
27
|
+
|
28
|
+
attribute :similar_artists
|
29
|
+
attribute :influenced_by_artists
|
30
|
+
|
31
|
+
# 'successor' was MusicStory's English translation, appears to mean 'is succeeded by'
|
32
|
+
# or perhaps more accurately 'influenced' / 'was followed by'. From their example sounds
|
33
|
+
# like it's similar semantics to 'influenced by' but in the opposite direction:
|
34
|
+
#
|
35
|
+
# <associe id_artiste="3795" id_associe="454" nom_associe="Michael Jackson">S</associe>
|
36
|
+
# "id_artist 3795 is Diana Ross et id_associe 454 is Michael Jackson.
|
37
|
+
# The relation means that Michael Jackson is a successor of Diana Ross.
|
38
|
+
# The reverse isn't always true, Michael Jackson will not necessarily be mentioned
|
39
|
+
# as influenced by Diana Ross"
|
40
|
+
attribute :successor_artists
|
41
|
+
|
42
|
+
def all_associated_artists
|
43
|
+
(similar_artists + influenced_by_artists + successor_artists).uniq
|
44
|
+
end
|
45
|
+
|
46
|
+
# The bio html converted to plain text, see HTMLToText
|
47
|
+
def plain_text_bio
|
48
|
+
bio_html && Utils::HTMLToText.convert(bio_html)
|
49
|
+
end
|
50
|
+
|
51
|
+
# The summary html converted to plain text, see HTMLToText
|
52
|
+
def plain_text_summary
|
53
|
+
summary_html && Utils::HTMLToText.convert(summary_html)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'date'

module MusicStory::Model
  # A batch of MusicStory data, identified by the path it lives at and the
  # state it is currently in.
  class Batch < ThinModels::Struct
    attribute :path
    attribute :state

    # matches a YYYY-MM-DD sequence, such as the one embedded in a batch's name
    DATE_PATTERN = /([0-9]{4})\-([0-9]{2})\-([0-9]{2})/

    # The date embedded in the basename of +path+.
    #
    # Returns a Date, or nil when there is no path, when the basename contains
    # no YYYY-MM-DD sequence, or when the digits do not form a real calendar
    # date (e.g. month 13).
    def date
      return nil if path.nil?

      m = DATE_PATTERN.match(File.basename(path))
      return nil unless m

      year, month, day = m.captures.map { |part| part.to_i }
      Date.valid_civil?(year, month, day) ? Date.new(year, month, day) : nil
    end

    # Short human-readable form, used in log messages.
    def to_s
      "#<Batch path=#{path}>"
    end

    # Two batches are equal when they have the same path; anything that is not
    # a Batch (including nil) is never equal.
    def ==(rhs)
      rhs.is_a?(Batch) && rhs.path == path
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module MusicStory
|
2
|
+
class Repository::ArtistSequel < Persistence::Sequel::IdentitySetRepository
|
3
|
+
set_model_class Model::Artist
|
4
|
+
use_table :artists, :id_sequence => true
|
5
|
+
map_column :name
|
6
|
+
map_column :forename
|
7
|
+
map_column :real_name
|
8
|
+
map_column :role
|
9
|
+
map_column :type
|
10
|
+
map_column :country
|
11
|
+
map_column :summary_html
|
12
|
+
map_column :image_filename
|
13
|
+
map_column :bio_html
|
14
|
+
|
15
|
+
[:similar, :influenced_by, :successor].each do |rel|
|
16
|
+
map_many_to_many :"#{rel}_artists",
|
17
|
+
:model_class => Model::Artist,
|
18
|
+
:join_table => :artist_associations,
|
19
|
+
:filter => {:relation => rel.to_s},
|
20
|
+
:left_key => :from_artist_id,
|
21
|
+
:right_key => :to_artist_id,
|
22
|
+
:writeable => true
|
23
|
+
end
|
24
|
+
|
25
|
+
[:main, :secondary, :influenced_by].each do |rel|
|
26
|
+
map_many_to_many :"#{rel}_genres",
|
27
|
+
:model_class => Model::Genre,
|
28
|
+
:join_table => :artist_genres,
|
29
|
+
:filter => {:relation => rel.to_s},
|
30
|
+
:left_key => :artist_id,
|
31
|
+
:right_key => :genre_id,
|
32
|
+
:writeable => true
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module MusicStory
|
2
|
+
# Parses an XML file of MusicStory artiste objects.
|
3
|
+
# The top-level structure should be
|
4
|
+
# <items>...<artistes><artist>...</artist>...<artist>...</artist></artistes></items>.
|
5
|
+
#
|
6
|
+
# A formal XSD doesn't appear to exist, so this is based entirely on data seen so far,
|
7
|
+
# together with some small pieces of info (such as the ARTIST_GENRE_RELATIONS and
|
8
|
+
# ASSOCIATION_TYPES) gleaned from a brief PDF doc in franglais (descriptionxml_en.pdf).
|
9
|
+
#
|
10
|
+
# Some elements mentioned in the PDF (such as collaboration, album, evenement etc)
|
11
|
+
# haven't been seen so far in artist XML files so aren't handled.
|
12
|
+
class Repository::ArtistXMLFile
|
13
|
+
def initialize(io)
|
14
|
+
@reader = Nokogiri::XML::Reader.from_io(io)
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.new_with_open_file(filename, &block)
|
18
|
+
File.open(filename, 'r') do |file|
|
19
|
+
yield new(file)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
# Codes used in their XML file format:
|
24
|
+
ARTIST_GENRE_RELATIONS = {
|
25
|
+
1 => :main,
|
26
|
+
2 => :secondary,
|
27
|
+
3 => :influenced_by
|
28
|
+
}
|
29
|
+
|
30
|
+
ASSOCIATION_TYPES = {
|
31
|
+
'A' => :similar,
|
32
|
+
'I' => :influenced_by,
|
33
|
+
'S' => :successor
|
34
|
+
}
|
35
|
+
|
36
|
+
include Enumerable
|
37
|
+
def get_all; self; end
|
38
|
+
|
39
|
+
# Yields one Model::Artist per <artiste> element found by the reader.
#
# Every element is re-parsed on its own, and its genres and associated
# artists are grouped by relation before the artist is built. Associated
# artists that fail invalid_artist? are dropped. Genres or associations whose
# relation code is not listed in ARTIST_GENRE_RELATIONS / ASSOCIATION_TYPES
# end up under a nil key and are not passed on to the artist.
#
# An <artiste> without a usable id attribute is yielded with a nil id rather
# than raising NoMethodError and aborting the whole iteration.
#
# Returns an Enumerator when no block is given.
def each
  return enum_for(:each) unless block_given?

  @reader.each do |node|
    next unless node.name == 'artiste' && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
    doc = Nokogiri::XML(node.outer_xml)

    # extract genres
    genres = Hash.new {|h,k| h[k]=[]}
    genres_and_relation = doc.xpath('//artiste/genres/genre').map do |genre_node|
      genre = Model::Genre.new(
        :id   => to_i_or_nil(genre_node.attr('id')),
        :name => genre_node.inner_text.strip
      )
      [genre, ARTIST_GENRE_RELATIONS[to_i_or_nil(genre_node.attr('relation'))]]
    end

    genres_and_relation.uniq.each do |genre, relation|
      genres[relation] << genre
    end

    # extract associations
    associations = Hash.new {|h,k| h[k]=[]}
    associated_artists_and_type = doc.xpath('//artiste/associes/associe').map do |associe_node|
      artist = Model::Artist.new({
        :id   => to_i_or_nil(associe_node.attr('id_associe')),
        :name => associe_node.attr('nom_associe')
      })
      [artist, ASSOCIATION_TYPES[associe_node.inner_text]]
    end

    associated_artists_and_type.uniq.each do |artist, type|
      # FIXME track non-failing errors, rather than keeping quiet about it
      associations[type] << artist unless invalid_artist?(artist)
    end

    # nil-safe lookup of the element's id attribute: at_xpath returns nil when
    # nothing matches and [] returns nil when the attribute is absent
    artiste = doc.at_xpath('//artiste')

    yield Model::Artist.new({
      :id             => to_i_or_nil(artiste && artiste['id']),
      :name           => doc.xpath('//artiste/nom').inner_text,
      :forename       => unless_empty(doc.xpath('//artiste/prenom').inner_text),
      :real_name      => unless_empty(doc.xpath('//artiste/nom_reel').inner_text),
      :role           => unless_empty(doc.xpath('//artiste/role').inner_text),
      :type           => unless_empty(doc.xpath('//artiste/type').inner_text),
      :country        => unless_empty(doc.xpath('//artiste/pays').inner_text),
      # not sure what the appropriate translation for resume vs texte_bio is here,
      # but in data seen so far they are both the same and both HTML not plain text:
      :summary_html   => unless_empty(doc.xpath('//artiste/resume').inner_text),
      :image_filename => unless_empty(doc.xpath('//artiste/image').inner_text),

      :bio_html              => unless_empty(doc.xpath('//artiste/texte_bio').inner_text),
      :main_genres           => genres[:main],
      :secondary_genres      => genres[:secondary],
      :influenced_by_genres  => genres[:influenced_by],
      :similar_artists       => associations[:similar],
      :influenced_by_artists => associations[:influenced_by],
      :successor_artists     => associations[:successor]
    })
  end
end
|
96
|
+
|
97
|
+
private
|
98
|
+
# Strips surrounding whitespace from +string+ and returns the result, or nil
# when nothing is left.
#
# string - a String, or nil (treated the same as an empty string, so a missing
#          value cannot raise NoMethodError)
#
# Returns the stripped String, or nil when it is empty.
def unless_empty(string)
  stripped = string.to_s.strip
  stripped.empty? ? nil : stripped
end
|
102
|
+
|
103
|
+
# Basic check that the core artist properties (name and id) are present and
# well-formed.
#
# artist - object responding to +name+ and +id+
#
# Returns true when the name is nil or blank, or when the id is nil or is not
# made up solely of digits; false otherwise. The digit check is anchored to
# the whole string so that an id such as "x1" is rejected rather than accepted
# merely because it contains a digit.
def invalid_artist?(artist)
  name = artist.name
  return true if name.nil? || name.strip.empty?

  id = artist.id
  return true if id.nil?

  (id.to_s =~ /\A[0-9]+\z/).nil?
end
|
108
|
+
|
109
|
+
# Converts +value+ to an Integer, or returns nil when it does not begin with
# a number.
#
# The pattern is anchored to the start of the string (optional whitespace and
# sign, then digits) because String#to_i only parses a leading number: with an
# unanchored check a value such as "abc123" matched the pattern but converted
# to 0, producing a bogus id instead of nil.
#
# value - anything responding to to_s and to_i (typically a String attribute
#         value, or nil)
#
# Returns an Integer, or nil.
def to_i_or_nil(value)
  return nil unless value.to_s =~ /\A\s*[+-]?[0-9]+/
  value.to_i
end
|
112
|
+
end
|
113
|
+
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
require 'net/sftp'
|
2
|
+
require 'music_story/model/batch'
|
3
|
+
|
4
|
+
module MusicStory
|
5
|
+
|
6
|
+
# Gives access to whole batches of music story data that are kept somewhere
|
7
|
+
# on an sftp site somewhere that music story can constantly deliver new data.
|
8
|
+
# Once downloaded, this data can be accessed using the xml repository, or
|
9
|
+
# imported in to a local database
|
10
|
+
class Repository::BatchSFTP
|
11
|
+
|
12
|
+
# the presence of this file inside a batch directory tells us that the sender
|
13
|
+
# has finished sending it
|
14
|
+
DELIVERY_COMPLETE = 'delivery.complete'
|
15
|
+
|
16
|
+
# some sub dirs we use to manage the flow of data
|
17
|
+
DIR_PROCESSING = 'processing'
|
18
|
+
DIR_PROCESSED = 'processed'
|
19
|
+
|
20
|
+
# memoised flag from sftp rename methods
|
21
|
+
RENAME_NATIVE = Net::SFTP::Constants::RenameFlags::NATIVE
|
22
|
+
|
23
|
+
def initialize(host, username, options={})
|
24
|
+
@host = host
|
25
|
+
@username = username
|
26
|
+
@options = options
|
27
|
+
@basedir = options[:basedir] || '/'
|
28
|
+
@batch_pattern = options[:batch_pattern] || 'music-story-data-*'
|
29
|
+
@logger = options[:logger] || Logger.new('/dev/null')
|
30
|
+
end
|
31
|
+
|
32
|
+
# start talking to the remote server, yielding the session to the block,
|
33
|
+
# which is closed after the block finishes executing.
|
34
|
+
# The block is yielded a wrapper object that lets you use the access methods
|
35
|
+
# in the repository, minus the first argument, for instance:
|
36
|
+
# repo.connect do |session|
|
37
|
+
# batch = session.new_batches.first
|
38
|
+
# session.download(batch, '/tmp/dir')
|
39
|
+
# end
|
40
|
+
def connect(&block)
|
41
|
+
return_result = nil
|
42
|
+
# the sftp.start method does not seem to return the last thing you execute
|
43
|
+
start_sftp_session do |sftp_session|
|
44
|
+
return_result = yield SessionWrapper.new(self, sftp_session)
|
45
|
+
end
|
46
|
+
return_result
|
47
|
+
end
|
48
|
+
|
49
|
+
def start_sftp_session(&block)
|
50
|
+
cnx_options = (@options[:net_sftp_options] || {}).
|
51
|
+
merge(:password => @options[:password])
|
52
|
+
|
53
|
+
@logger.info("Starting sftp session to '#{@host}'")
|
54
|
+
Net::SFTP.start(@host, @username, cnx_options) do |sftp_session|
|
55
|
+
block.call(sftp_session)
|
56
|
+
end.tap do
|
57
|
+
@logger.info("Finished sftp session to '#{@host}'")
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
#
|
62
|
+
# the following methods should be accessed by using connect, and not directly
|
63
|
+
#
|
64
|
+
|
65
|
+
# return a list of batches on the sftp that are in the processed state, i.e
|
66
|
+
# live in the `processed` directory
|
67
|
+
def processed_batches(w)
|
68
|
+
dir = join(@basedir, DIR_PROCESSED)
|
69
|
+
w.sftp.dir[dir, '*'].map do |entry|
|
70
|
+
Model::Batch.new(:path => join(dir, entry.name), :state => :processed)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
# return a list of batches on the sftp site that are in the processing
|
75
|
+
# state, i.e live in the `processing` directory
|
76
|
+
def processing_batches(w)
|
77
|
+
dir = join(@basedir, DIR_PROCESSING)
|
78
|
+
w.sftp.dir[dir, '*'].map do |entry|
|
79
|
+
Model::Batch.new(:path => join(dir, entry.name), :state => :processing)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# return a list of all the batches on the sftp site that are ready to
|
84
|
+
# be downloaded or we can start processing them
|
85
|
+
def new_batches(w)
|
86
|
+
@logger.debug("Looking for new batches in remote dir '#@basedir' with pattern #@batch_pattern")
|
87
|
+
complete_dirs = w.sftp.dir[@basedir, @batch_pattern].select do |entry|
|
88
|
+
next if /\.log$/.match(entry.name) # skip log files (MSP#1915)
|
89
|
+
|
90
|
+
w.sftp.dir[join(@basedir, entry.name), DELIVERY_COMPLETE].any?.tap do |f|
|
91
|
+
if f
|
92
|
+
@logger.debug(" Found new batch: #{entry.name}")
|
93
|
+
else
|
94
|
+
@logger.debug(" Incomplete batch: #{entry.name}")
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
complete_dirs.map do |entry|
|
100
|
+
Model::Batch.new(:path => join(@basedir, entry.name),
|
101
|
+
:state => :new)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# download a batch. Should work for a batch in any state
|
106
|
+
def download(w, batch, local_dir)
|
107
|
+
@logger.info("Downloading #{batch.path} to #{local_dir}...")
|
108
|
+
w.sftp.download!(batch.path, local_dir, :recursive => true) do |event, downloader, *args|
|
109
|
+
case event
|
110
|
+
when :open then
|
111
|
+
# args[0] : file metadata
|
112
|
+
@logger.debug "Starting download: #{args[0].remote} -> #{args[0].local} (#{args[0].size}) bytes"
|
113
|
+
when :close then
|
114
|
+
# args[0] : file metadata
|
115
|
+
@logger.debug "Finished download: #{args[0].remote}"
|
116
|
+
when :finish then
|
117
|
+
@logger.debug "Download complete"
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
# return true if there are any batches available
|
123
|
+
def new_available?(w)
|
124
|
+
new_batches(w).any?
|
125
|
+
end
|
126
|
+
|
127
|
+
# move a batch in to the processing state, moving its location on the remote
|
128
|
+
# fs
|
129
|
+
def mark_processing(w, batch)
|
130
|
+
new_name = join(@basedir, DIR_PROCESSING, File.basename(batch.path))
|
131
|
+
@logger.info("Marking #{batch} as processing")
|
132
|
+
@logger.debug(" moving #{batch.path} to #{new_name}")
|
133
|
+
w.sftp.rename(batch.path, new_name, RENAME_NATIVE)
|
134
|
+
batch.path = new_name
|
135
|
+
batch.state = :processing
|
136
|
+
end
|
137
|
+
|
138
|
+
# Move a batch in to the processed state by renaming it in to the
# `processed` directory on the remote fs, then update the batch's path and
# state to match.
#
# w               - the session wrapper whose #sftp session is used
# batch           - the batch to move; its path and state are overwritten
# path_to_logfile - optional local path of a log file; when given it is
#                   uploaded to the base directory as "<batch basename>.log"
#
# Returns the result of the upload when a log file was given, otherwise nil.
#
# NOTE(review): the rename's result is never inspected here, so a failed
# rename would go unnoticed while the batch is still updated locally - confirm
# whether a blocking, raising variant should be used instead.
def mark_processed(w, batch, path_to_logfile=nil)
  batch_basename = File.basename(batch.path)
  new_name = join(@basedir, DIR_PROCESSED, batch_basename)
  @logger.info("Marking #{batch} as processed")
  @logger.debug("  moving #{batch.path} to #{new_name}")
  w.sftp.rename(batch.path, new_name, RENAME_NATIVE)
  batch.path = new_name
  batch.state = :processed

  return nil unless path_to_logfile

  # the log file sits next to the batch directories, not inside `processed`
  w.sftp.upload!(path_to_logfile, join(@basedir, batch_basename + '.log'))
end
|
154
|
+
|
155
|
+
private
|
156
|
+
|
157
|
+
# less chars ftw
|
158
|
+
def join(*args) ; File.join(*args) ; end
|
159
|
+
end
|
160
|
+
|
161
|
+
# Wraps an open sftp session together with the repository that created it, so
# that the repository's access methods can be called without passing the
# session explicitly: any call the repository responds to is forwarded to it
# with this wrapper inserted as the first argument.
class Repository::BatchSFTP::SessionWrapper

  # the underlying sftp session
  attr_reader :sftp

  # repository   - the object that calls are forwarded to
  # sftp_session - the sftp session exposed through #sftp
  def initialize(repository, sftp_session)
    @repository = repository
    @sftp = sftp_session
  end

  # Forwards +name+ to the repository, prepending this wrapper to the
  # arguments. Anything the repository does not publicly respond to goes to
  # super, so the usual NoMethodError is raised.
  def method_missing(name, *args, &block)
    return super unless @repository.respond_to?(name)
    @repository.send(name, self, *args, &block)
  end

  # Reports the repository's public methods as available on the wrapper, in
  # addition to the wrapper's own. Implemented as respond_to_missing? rather
  # than by overriding respond_to?, so the wrapper's own methods (such as
  # #sftp) are still reported and the optional second argument of respond_to?
  # keeps working.
  def respond_to_missing?(name, include_private = false)
    @repository.respond_to?(name) || super
  end
end
|
178
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module MusicStory
|
2
|
+
class Repository::Sequel
|
3
|
+
def initialize(db)
|
4
|
+
db = Sequel.connect(db) unless db.is_a?(Sequel::Database)
|
5
|
+
@db = db
|
6
|
+
@artist_repo = Repository::ArtistSequel.new(db)
|
7
|
+
@genre_repo = Repository::GenreSequel.new(db)
|
8
|
+
[:similar_artists, :influenced_by_artists, :successor_artists].each do |prop|
|
9
|
+
@artist_repo.mapper(prop).target_repo = @artist_repo
|
10
|
+
end
|
11
|
+
[:main_genres, :secondary_genres, :influenced_by_genres].each do |prop|
|
12
|
+
@artist_repo.mapper(prop).target_repo = @genre_repo
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
attr_reader :artist_repo, :genre_repo, :db
|
17
|
+
|
18
|
+
def drop_tables!
|
19
|
+
[:genres, :artist_associations, :artist_genres, :artists].each do |table|
|
20
|
+
begin ; @db.drop_table(table) ; rescue ; end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def create_tables!
|
25
|
+
@db.create_table(:artist_associations, :ignore_index_errors=>true) do
|
26
|
+
Integer :from_artist_id, :null=>false
|
27
|
+
Integer :to_artist_id, :null=>false
|
28
|
+
String :relation, :size=>16
|
29
|
+
|
30
|
+
primary_key [:from_artist_id, :relation, :to_artist_id]
|
31
|
+
|
32
|
+
index [:to_artist_id], :name=>:to_artist_id
|
33
|
+
end
|
34
|
+
|
35
|
+
@db.create_table(:artist_genres, :ignore_index_errors=>true) do
|
36
|
+
Integer :artist_id, :null=>false
|
37
|
+
Integer :genre_id, :null=>false
|
38
|
+
String :relation, :size=>16
|
39
|
+
|
40
|
+
primary_key [:artist_id, :relation, :genre_id]
|
41
|
+
|
42
|
+
index [:genre_id], :name=>:genre_id
|
43
|
+
end
|
44
|
+
|
45
|
+
@db.create_table(:artists, :ignore_index_errors=>true) do
|
46
|
+
primary_key :id
|
47
|
+
String :name, :null=>false, :size=>255
|
48
|
+
String :forename, :size=>255
|
49
|
+
String :real_name, :size=>255
|
50
|
+
String :role, :size=>64
|
51
|
+
String :type, :size=>64
|
52
|
+
String :country, :size=>64
|
53
|
+
String :summary_html, :text=>true
|
54
|
+
String :bio_html, :text=>true
|
55
|
+
String :image_filename, :text=>true
|
56
|
+
|
57
|
+
index [:name], :name=>:name
|
58
|
+
end
|
59
|
+
|
60
|
+
@db.create_table(:genres) do
|
61
|
+
primary_key :id
|
62
|
+
String :name, :null=>false, :size=>255
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module MusicStory
|
2
|
+
# Converts HTML to plain text, converting <br>'s into newlines but
|
3
|
+
# stripping all other tags.
|
4
|
+
# May want to add support for other things like <p> into \n\n if they
|
5
|
+
# crop up; MusicStory only seems to use <br> though
|
6
|
+
class Utils::HTMLToText < Nokogiri::XML::SAX::Document
|
7
|
+
def self.convert(html)
|
8
|
+
doc = new
|
9
|
+
Nokogiri::HTML::SAX::Parser.new(doc).parse(html)
|
10
|
+
doc.to_s
|
11
|
+
end
|
12
|
+
|
13
|
+
def initialize
|
14
|
+
@result = ''
|
15
|
+
end
|
16
|
+
|
17
|
+
def characters(string)
|
18
|
+
@result << string
|
19
|
+
end
|
20
|
+
alias :cdata_block :characters
|
21
|
+
|
22
|
+
def start_element(name, attributes=nil)
|
23
|
+
@result << "\n" if name.downcase == 'br'
|
24
|
+
end
|
25
|
+
|
26
|
+
def to_s
|
27
|
+
@result.strip
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module MusicStory
|
2
|
+
class Utils::XMLToDBImporter
|
3
|
+
def self.import_file_into_db(filename, db, create_tables=false)
|
4
|
+
sequel_repos = Repository::Sequel.new(db)
|
5
|
+
sequel_repos.create_tables! if create_tables
|
6
|
+
results = Repository::ArtistXMLFile.new_with_open_file(filename) do |xml_repo|
|
7
|
+
new(xml_repo, sequel_repos).import
|
8
|
+
end
|
9
|
+
|
10
|
+
return results.merge(:sequel_repos => sequel_repos)
|
11
|
+
end
|
12
|
+
|
13
|
+
def initialize(xml_repo, sequel_repos)
|
14
|
+
@xml_repo = xml_repo
|
15
|
+
@sequel_repos = sequel_repos
|
16
|
+
end
|
17
|
+
|
18
|
+
# Imports every artist yielded by the XML repository into the Sequel
# repositories.
#
# Each artist is stored inside its own transaction on the artist repository:
# first the artists it is associated with, then its genres, then the artist
# itself. An error raised while storing an artist is rescued and recorded, and
# the import carries on with the next artist.
#
# Returns a Hash with two keys:
#   :successes - Array of the artists that were stored without error
#   :failures  - Array of [artist, exception] pairs for those that were not
def import
  failures = []
  successes = []

  @xml_repo.each do |artist|
    begin
      @sequel_repos.artist_repo.transaction do
        artist.all_associated_artists.each do |a|
          @sequel_repos.artist_repo.store(a)
        end
        artist.all_genres.each do |g|
          @sequel_repos.genre_repo.store(g)
        end
        @sequel_repos.artist_repo.store(artist)
      end

      # Only reached when the transaction block completed without raising, so
      # an artist is never reported as both a failure and a success.
      successes << artist
    rescue => e
      failures << [artist, e]
    end
  end

  {:successes => successes, :failures => failures}
end
|
42
|
+
end
|
43
|
+
end
|
metadata
ADDED
@@ -0,0 +1,197 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: music_story
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 2
|
10
|
+
version: 1.0.2
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Matthew Willson
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-10-03 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: sequel
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ~>
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 43
|
29
|
+
segments:
|
30
|
+
- 3
|
31
|
+
- 11
|
32
|
+
- 0
|
33
|
+
version: 3.11.0
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: persistence
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 17
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
- 3
|
48
|
+
- 1
|
49
|
+
version: 0.3.1
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: thin_models
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 19
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
- 1
|
64
|
+
- 4
|
65
|
+
version: 0.1.4
|
66
|
+
type: :runtime
|
67
|
+
version_requirements: *id003
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: nokogiri
|
70
|
+
prerelease: false
|
71
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
hash: 3
|
77
|
+
segments:
|
78
|
+
- 1
|
79
|
+
- 5
|
80
|
+
- 0
|
81
|
+
version: 1.5.0
|
82
|
+
type: :runtime
|
83
|
+
version_requirements: *id004
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
name: net-sftp
|
86
|
+
prerelease: false
|
87
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
hash: 3
|
93
|
+
segments:
|
94
|
+
- 0
|
95
|
+
version: "0"
|
96
|
+
type: :runtime
|
97
|
+
version_requirements: *id005
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: minitest
|
100
|
+
prerelease: false
|
101
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
hash: 3
|
107
|
+
segments:
|
108
|
+
- 0
|
109
|
+
version: "0"
|
110
|
+
type: :development
|
111
|
+
version_requirements: *id006
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: sqlite3
|
114
|
+
prerelease: false
|
115
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ">="
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
hash: 3
|
121
|
+
segments:
|
122
|
+
- 0
|
123
|
+
version: "0"
|
124
|
+
type: :development
|
125
|
+
version_requirements: *id007
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: mock_sftp
|
128
|
+
prerelease: false
|
129
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
130
|
+
none: false
|
131
|
+
requirements:
|
132
|
+
- - ">="
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
hash: 3
|
135
|
+
segments:
|
136
|
+
- 0
|
137
|
+
version: "0"
|
138
|
+
type: :development
|
139
|
+
version_requirements: *id008
|
140
|
+
description:
|
141
|
+
email:
|
142
|
+
- matthew@playlouder.com
|
143
|
+
executables: []
|
144
|
+
|
145
|
+
extensions: []
|
146
|
+
|
147
|
+
extra_rdoc_files: []
|
148
|
+
|
149
|
+
files:
|
150
|
+
- lib/music_story/version.rb
|
151
|
+
- lib/music_story/model/genre.rb
|
152
|
+
- lib/music_story/model/batch.rb
|
153
|
+
- lib/music_story/model/artist.rb
|
154
|
+
- lib/music_story/utils/html_to_text.rb
|
155
|
+
- lib/music_story/utils/xml_to_db_importer.rb
|
156
|
+
- lib/music_story/repository/sequel.rb
|
157
|
+
- lib/music_story/repository/artist_xml_file.rb
|
158
|
+
- lib/music_story/repository/batch_sftp.rb
|
159
|
+
- lib/music_story/repository/artist_sequel.rb
|
160
|
+
- lib/music_story/repository/genre_sequel.rb
|
161
|
+
- lib/music_story.rb
|
162
|
+
- README.txt
|
163
|
+
homepage:
|
164
|
+
licenses: []
|
165
|
+
|
166
|
+
post_install_message:
|
167
|
+
rdoc_options: []
|
168
|
+
|
169
|
+
require_paths:
|
170
|
+
- lib
|
171
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
172
|
+
none: false
|
173
|
+
requirements:
|
174
|
+
- - ">="
|
175
|
+
- !ruby/object:Gem::Version
|
176
|
+
hash: 3
|
177
|
+
segments:
|
178
|
+
- 0
|
179
|
+
version: "0"
|
180
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
181
|
+
none: false
|
182
|
+
requirements:
|
183
|
+
- - ">="
|
184
|
+
- !ruby/object:Gem::Version
|
185
|
+
hash: 3
|
186
|
+
segments:
|
187
|
+
- 0
|
188
|
+
version: "0"
|
189
|
+
requirements: []
|
190
|
+
|
191
|
+
rubyforge_project:
|
192
|
+
rubygems_version: 1.8.24
|
193
|
+
signing_key:
|
194
|
+
specification_version: 3
|
195
|
+
summary: Wrapper code for the MusicStory data product
|
196
|
+
test_files: []
|
197
|
+
|