music_story 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +13 -0
- data/lib/music_story.rb +25 -0
- data/lib/music_story/model/artist.rb +56 -0
- data/lib/music_story/model/batch.rb +21 -0
- data/lib/music_story/model/genre.rb +7 -0
- data/lib/music_story/repository/artist_sequel.rb +35 -0
- data/lib/music_story/repository/artist_xml_file.rb +113 -0
- data/lib/music_story/repository/batch_sftp.rb +178 -0
- data/lib/music_story/repository/genre_sequel.rb +7 -0
- data/lib/music_story/repository/sequel.rb +66 -0
- data/lib/music_story/utils/html_to_text.rb +30 -0
- data/lib/music_story/utils/xml_to_db_importer.rb +43 -0
- data/lib/music_story/version.rb +4 -0
- metadata +197 -0
data/README.txt
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Wrapper code for MusicStory data products.
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
|
5
|
+
MusicStory::XMLParser.each_in_file('music-story-data.xml') do |artist|
|
6
|
+
puts artist.name
|
7
|
+
puts artist.plain_text_bio
|
8
|
+
puts artist.main_genres[0].id
|
9
|
+
# see MusicStory::Model::{Artist,Genre} for more available properties
|
10
|
+
end
|
11
|
+
|
12
|
+
Should be able to cope with big XML files, as it uses an XML::Reader to scan through
|
13
|
+
the file one artist at a time; only the current artist object is kept in memory. (It has only been tried on a 330KB file so far, though, so YMMV...)
|
data/lib/music_story.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Top-level namespace for the MusicStory wrapper code.
#
# The nested modules are declared empty up front so that the files required
# afterwards can define their classes with the compact form
# (e.g. `class Model::Artist`, `class Repository::ArtistXMLFile`).
module MusicStory
  module Model
  end

  module Repository
  end

  module Utils
  end
end
|
6
|
+
|
7
|
+
# alias it
|
8
|
+
Musicstory = MusicStory
|
9
|
+
|
10
|
+
require 'nokogiri'
|
11
|
+
require 'music_story/utils/html_to_text'
|
12
|
+
require 'music_story/utils/xml_to_db_importer'
|
13
|
+
|
14
|
+
require 'thin_models/struct/identity'
|
15
|
+
require 'music_story/model/artist'
|
16
|
+
require 'music_story/model/genre'
|
17
|
+
|
18
|
+
require 'music_story/repository/artist_xml_file'
|
19
|
+
|
20
|
+
require 'sequel'
|
21
|
+
require 'persistence'
|
22
|
+
require 'persistence/sequel'
|
23
|
+
require 'music_story/repository/artist_sequel'
|
24
|
+
require 'music_story/repository/genre_sequel'
|
25
|
+
require 'music_story/repository/sequel'
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module MusicStory
  # An artist as described by the MusicStory data: identifying details,
  # biography text (as HTML), genres and links to other artists.
  class Model::Artist < ThinModels::Struct
    identity_attribute :id # MusicStory identifier
    attribute :name        # 'nom' at source
    attribute :forename    # 'prenom' at source
    attribute :real_name   # 'nom_reel' at source
    attribute :role
    attribute :type
    attribute :country     # 'pays' at source
    attribute :image_filename

    # 'resume' and 'texte_bio' in the MusicStory XML. The right translation of
    # resume vs texte_bio is unclear; in the data seen so far the two are the
    # same, except that texte_bio carries a credit/copyright line at the end.
    # Both hold HTML rather than plain text (see plain_text_bio and
    # plain_text_summary for converted versions).
    attribute :summary_html
    attribute :bio_html

    # Genre lists, one per kind of artist/genre relation.
    attribute :main_genres
    attribute :secondary_genres
    attribute :influenced_by_genres

    # Associated artist lists, one per kind of association.
    attribute :similar_artists
    attribute :influenced_by_artists

    # 'successor' was MusicStory's English translation, appears to mean 'is succeeded by'
    # or perhaps more accurately 'influenced' / 'was followed by'. From their example sounds
    # like it's similar semantics to 'influenced by' but in the opposite direction:
    #
    # <associe id_artiste="3795" id_associe="454" nom_associe="Michael Jackson">S</associe>
    # "id_artist 3795 is Diana Ross et id_associe 454 is Michael Jackson.
    # The relation means that Micheal Jackson is a successor of Diana Ross.
    # The reverse isn't not always true, Michael Jackson will not necessarily be mentioned
    # as influenced by Diana Ross"
    attribute :successor_artists

    # Every genre attached to this artist, whatever the relation, without
    # duplicates.
    def all_genres
      combined = main_genres + secondary_genres
      combined += influenced_by_genres
      combined.uniq
    end

    # Every artist associated with this one, whatever the association type,
    # without duplicates.
    def all_associated_artists
      combined = similar_artists + influenced_by_artists
      combined += successor_artists
      combined.uniq
    end

    # The bio html converted to plain text (see Utils::HTMLToText).
    # Passes a nil/false bio through untouched.
    def plain_text_bio
      html = bio_html
      return html unless html
      Utils::HTMLToText.convert(html)
    end

    # The summary html converted to plain text (see Utils::HTMLToText).
    # Passes a nil/false summary through untouched.
    def plain_text_summary
      html = summary_html
      return html unless html
      Utils::HTMLToText.convert(html)
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Batch#date builds a Date; require it explicitly rather than relying on
# some other library having loaded it first.
require 'date'

module MusicStory::Model
  # A batch of delivered data, identified by its path. `state` is set by the
  # code that creates or moves the batch (see Repository::BatchSFTP).
  class Batch < ThinModels::Struct
    attribute :path
    attribute :state

    # Matches a YYYY-MM-DD date; the three groups are year, month and day.
    DATE_PATTERN = /([0-9]{4})\-([0-9]{2})\-([0-9]{2})/

    # The date embedded in the last component of the batch's path.
    #
    # Returns a Date, or nil when the basename contains nothing matching
    # DATE_PATTERN. Date.new is given the matched digits as-is, so digits
    # that do not form a real calendar date make it raise.
    def date
      m = DATE_PATTERN.match(File.basename(path))
      m && Date.new(m[1].to_i, m[2].to_i, m[3].to_i)
    end

    def to_s
      "#<Batch path=#{path}>"
    end

    # Two batches are equal when they have the same path; state is ignored.
    # Always returns true or false (nil and non-Batch objects compare false).
    def ==(rhs)
      rhs.is_a?(Batch) && rhs.path == path
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module MusicStory
  # Persistence::Sequel repository that maps Model::Artist onto the :artists
  # table, with its genre and artist associations held in join tables.
  class Repository::ArtistSequel < Persistence::Sequel::IdentitySetRepository
    set_model_class Model::Artist
    use_table :artists, :id_sequence => true

    # simple columns, each mapped to the artist attribute of the same name
    [
      :name, :forename, :real_name, :role, :type, :country,
      :summary_html, :image_filename, :bio_html
    ].each { |column| map_column column }

    # artist -> artist links: one shared join table, told apart by the value
    # in its `relation` column
    [:similar, :influenced_by, :successor].each do |relation|
      association_options = {
        :model_class => Model::Artist,
        :join_table  => :artist_associations,
        :filter      => {:relation => relation.to_s},
        :left_key    => :from_artist_id,
        :right_key   => :to_artist_id,
        :writeable   => true
      }
      map_many_to_many "#{relation}_artists".to_sym, association_options
    end

    # artist -> genre links: same arrangement, using the artist_genres table
    [:main, :secondary, :influenced_by].each do |relation|
      association_options = {
        :model_class => Model::Genre,
        :join_table  => :artist_genres,
        :filter      => {:relation => relation.to_s},
        :left_key    => :artist_id,
        :right_key   => :genre_id,
        :writeable   => true
      }
      map_many_to_many "#{relation}_genres".to_sym, association_options
    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module MusicStory
  module Repository
    # Parses an XML file of MusicStory artiste objects.
    # The top-level structure should be
    # <items>...<artistes><artiste>...</artiste>...<artiste>...</artiste></artistes></items>.
    #
    # A formal XSD doesn't appear to exist, so this is based entirely on data seen so far,
    # together with some small pieces of info (such as the ARTIST_GENRE_RELATIONS and
    # ASSOCIATION_TYPES) gleaned from a brief PDF doc in franglais (descriptionxml_en.pdf).
    #
    # Some elements mentioned in the PDF (such as collaboration, album, evenement etc)
    # haven't been seen so far in artist XML files so aren't handled.
    class ArtistXMLFile
      include Enumerable

      # Codes used in their XML file format:
      ARTIST_GENRE_RELATIONS = {
        1 => :main,
        2 => :secondary,
        3 => :influenced_by
      }.freeze

      ASSOCIATION_TYPES = {
        'A' => :similar,
        'I' => :influenced_by,
        'S' => :successor
      }.freeze

      # io - an IO positioned at the start of the XML document; it is read
      #      incrementally through a Nokogiri::XML::Reader.
      def initialize(io)
        @reader = Nokogiri::XML::Reader.from_io(io)
      end

      # Opens +filename+ for reading and yields a repository wrapping it.
      # The file is closed when the block returns; the block's value is
      # returned.
      def self.new_with_open_file(filename, &block)
        File.open(filename, 'r') do |file|
          yield new(file)
        end
      end

      def get_all; self; end

      # Yields a Model::Artist for each <artiste> element, in document order.
      # Only the element currently being converted is parsed into a document.
      # Returns an enumerator when called without a block.
      def each
        return enum_for(:each) unless block_given?

        @reader.each do |node|
          next unless node.name == 'artiste' && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
          yield build_artist(Nokogiri::XML(node.outer_xml))
        end
      end

      private

      # Builds the Model::Artist for a document whose root is one <artiste>.
      def build_artist(doc)
        genres       = extract_genres(doc)
        associations = extract_associations(doc)
        # a missing id attribute gives a nil id rather than aborting the parse
        id_attr      = doc.at_xpath('//artiste/@id')

        Model::Artist.new({
          :id             => to_i_or_nil(id_attr && id_attr.value),
          :name           => doc.xpath('//artiste/nom').inner_text,
          :forename       => unless_empty(doc.xpath('//artiste/prenom').inner_text),
          :real_name      => unless_empty(doc.xpath('//artiste/nom_reel').inner_text),
          :role           => unless_empty(doc.xpath('//artiste/role').inner_text),
          :type           => unless_empty(doc.xpath('//artiste/type').inner_text),
          :country        => unless_empty(doc.xpath('//artiste/pays').inner_text),
          # not sure what the appropriate translation for resume vs texte_bio is here,
          # but in data seen so far they are both the same and both HTML not plain text:
          :summary_html   => unless_empty(doc.xpath('//artiste/resume').inner_text),
          :image_filename => unless_empty(doc.xpath('//artiste/image').inner_text),

          :bio_html       => unless_empty(doc.xpath('//artiste/texte_bio').inner_text),
          :main_genres    => genres[:main],
          :secondary_genres      => genres[:secondary],
          :influenced_by_genres  => genres[:influenced_by],
          :similar_artists       => associations[:similar],
          :influenced_by_artists => associations[:influenced_by],
          :successor_artists     => associations[:successor]
        })
      end

      # Returns a hash of relation symbol => [Model::Genre], with an empty
      # array for any relation that has no genres.
      def extract_genres(doc)
        genres = Hash.new {|h,k| h[k]=[]}
        genres_and_relation = doc.xpath('//artiste/genres/genre').map do |genre_node|
          genre = Model::Genre.new(
            :id   => to_i_or_nil(genre_node.attr('id')),
            :name => genre_node.inner_text.strip
          )
          [genre, ARTIST_GENRE_RELATIONS[to_i_or_nil(genre_node.attr('relation'))]]
        end

        genres_and_relation.uniq.each do |genre, relation|
          genres[relation] << genre
        end
        genres
      end

      # Returns a hash of association symbol => [Model::Artist], with an empty
      # array for any association type that has no artists. The associated
      # artists carry only an id and a name.
      def extract_associations(doc)
        associations = Hash.new {|h,k| h[k]=[]}
        associated_artists_and_type = doc.xpath('//artiste/associes/associe').map do |assoc_node|
          artist = Model::Artist.new({
            :id   => to_i_or_nil(assoc_node.attr('id_associe')),
            :name => assoc_node.attr('nom_associe')
          })
          [artist, ASSOCIATION_TYPES[assoc_node.inner_text]]
        end

        associated_artists_and_type.uniq.each do |artist, type|
          # FIXME track non-failing errors, rather than keeping quiet about it
          associations[type] << artist unless invalid_artist?(artist)
        end
        associations
      end

      # The stripped string, or nil when nothing but whitespace is left.
      def unless_empty(string)
        string = string.strip
        string unless string.empty?
      end

      # basic check that core artist properties are there and correct:
      # a non-blank name and an id made up of digits only
      def invalid_artist?(artist)
        name = artist.name
        name.nil? || name.strip.empty? || to_i_or_nil(artist.id).nil?
      end

      # Converts +value+ to an Integer, or returns nil unless it consists
      # solely of digits (surrounding whitespace is ignored). The pattern is
      # anchored so that text such as "abc1" is rejected instead of being
      # turned into 0 by to_i.
      def to_i_or_nil(value)
        text = value.to_s.strip
        text.to_i if text =~ /\A[0-9]+\z/
      end
    end
  end
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
require 'net/sftp'
|
2
|
+
require 'music_story/model/batch'
|
3
|
+
|
4
|
+
module MusicStory

  # Gives access to whole batches of music story data that are kept on an
  # sftp site, where music story can constantly deliver new data.
  # Once downloaded, this data can be accessed using the xml repository, or
  # imported in to a local database
  class Repository::BatchSFTP

    # the presence of this file inside a batch directory tells us that the sender
    # has finished sending it
    DELIVERY_COMPLETE = 'delivery.complete'

    # some sub dirs we use to manage the flow of data
    DIR_PROCESSING = 'processing'
    DIR_PROCESSED  = 'processed'

    # memoised flag from sftp rename methods
    RENAME_NATIVE = Net::SFTP::Constants::RenameFlags::NATIVE

    # host, username - where to connect and as whom.
    # options - a hash, reading these keys:
    #   :basedir          remote directory holding the batches (default '/')
    #   :batch_pattern    glob for new batch dirs (default 'music-story-data-*')
    #   :logger           logger to report to (default logs to /dev/null)
    #   :password         password handed to Net::SFTP.start
    #   :net_sftp_options hash of further options handed to Net::SFTP.start
    def initialize(host, username, options={})
      @host     = host
      @username = username
      @options  = options
      @basedir  = options[:basedir] || '/'
      @batch_pattern = options[:batch_pattern] || 'music-story-data-*'
      @logger   = options[:logger] || Logger.new('/dev/null')
    end

    # start talking to the remote server, yielding the session to the block,
    # which is closed after the block finishes executing.
    # The block is yielded a wrapper object that lets you use the access methods
    # in the repository, minus the first argument, for instance:
    #   repo.connect do |session|
    #     batch = session.new_batches.first
    #     session.download(batch, '/tmp/dir')
    #   end
    # Returns whatever the block returned.
    def connect(&block)
      return_result = nil
      # the sftp.start method does not seem to return the last thing you execute
      start_sftp_session do |sftp_session|
        return_result = yield SessionWrapper.new(self, sftp_session)
      end
      return_result
    end

    # Opens an sftp session to the configured host and passes it to the block,
    # logging before the session starts and after it finishes.
    def start_sftp_session(&block)
      cnx_options = (@options[:net_sftp_options] || {}).dup
      # only set the password when one was given, so that a password supplied
      # through :net_sftp_options is not overwritten with nil
      cnx_options[:password] = @options[:password] unless @options[:password].nil?

      @logger.info("Starting sftp session to '#{@host}'")
      Net::SFTP.start(@host, @username, cnx_options) do |sftp_session|
        block.call(sftp_session)
      end.tap do
        @logger.info("Finished sftp session to '#{@host}'")
      end
    end

    #
    # the following methods should be accessed by using connect, and not directly
    #

    # return a list of batches on the sftp that are in the processed state, i.e
    # live in the `processed` directory
    def processed_batches(w)
      batches_in(w, DIR_PROCESSED, :processed)
    end

    # return a list of batches on the sftp site that are in the processing
    # state, i.e live in the `processing` directory
    def processing_batches(w)
      batches_in(w, DIR_PROCESSING, :processing)
    end

    # return a list of all the batches on the sftp site that are ready to
    # be downloaded or we can start processing them
    def new_batches(w)
      @logger.debug("Looking for new batches in remote dir '#@basedir' with pattern #@batch_pattern")
      complete_dirs = w.sftp.dir[@basedir, @batch_pattern].select do |entry|
        next if /\.log$/.match(entry.name) # skip log files (MSP#1915)

        # a batch only counts as new once its delivery marker file is there
        w.sftp.dir[join(@basedir, entry.name), DELIVERY_COMPLETE].any?.tap do |f|
          if f
            @logger.debug(" Found new batch: #{entry.name}")
          else
            @logger.debug(" Incomplete batch: #{entry.name}")
          end
        end
      end

      complete_dirs.map do |entry|
        Model::Batch.new(:path => join(@basedir, entry.name),
                         :state => :new)
      end
    end

    # download a batch. Should work for a batch in any state
    def download(w, batch, local_dir)
      @logger.info("Downloading #{batch.path} to #{local_dir}...")
      w.sftp.download!(batch.path, local_dir, :recursive => true) do |event, downloader, *args|
        case event
        when :open then
          # args[0] : file metadata
          @logger.debug "Starting download: #{args[0].remote} -> #{args[0].local} (#{args[0].size}) bytes"
        when :close then
          # args[0] : file metadata
          @logger.debug "Finished download: #{args[0].remote}"
        when :finish then
          @logger.debug "Download complete"
        end
      end
    end

    # return true if there are any batches available
    def new_available?(w)
      new_batches(w).any?
    end

    # move a batch in to the processing state, moving its location on the remote
    # fs
    def mark_processing(w, batch)
      move_batch(w, batch, DIR_PROCESSING, :processing)
    end

    # move a batch in to the processed state, moving its location on the remote
    # fs. When path_to_logfile is given, that local file is also uploaded to
    # the base directory as `<batch basename>.log`.
    def mark_processed(w, batch, path_to_logfile=nil)
      batch_basename = File.basename(batch.path)
      move_batch(w, batch, DIR_PROCESSED, :processed)

      if path_to_logfile
        remote_logfile_path = join(@basedir, batch_basename + '.log')
        w.sftp.upload!(path_to_logfile, remote_logfile_path)
      end
    end

    private

    # less chars ftw
    def join(*args) ; File.join(*args) ; end

    # list every entry of the given sub directory of the base dir as a batch
    # in the given state
    def batches_in(w, subdir, state)
      dir = join(@basedir, subdir)
      w.sftp.dir[dir, '*'].map do |entry|
        Model::Batch.new(:path => join(dir, entry.name), :state => state)
      end
    end

    # rename the batch into the given sub directory of the base dir, then
    # update the batch object's path and state to match
    def move_batch(w, batch, subdir, state)
      new_name = join(@basedir, subdir, File.basename(batch.path))
      @logger.info("Marking #{batch} as #{state}")
      @logger.debug(" moving #{batch.path} to #{new_name}")
      # NOTE(review): `rename` looks like Net::SFTP's non-blocking variant
      # (as opposed to `rename!`) - confirm that a failed rename cannot go
      # unnoticed while the batch is updated below.
      w.sftp.rename(batch.path, new_name, RENAME_NATIVE)
      batch.path  = new_name
      batch.state = state
    end
  end

  # What Repository::BatchSFTP#connect yields: it remembers the sftp session
  # and forwards every other call to the repository, passing itself as the
  # first argument.
  class Repository::BatchSFTP::SessionWrapper

    attr_reader :sftp

    def initialize(repository, sftp_session)
      @repository = repository
      @sftp = sftp_session
    end

    def method_missing(name, *args, &block)
      @repository.send(name, self, *args, &block)
    end

    # True for the wrapper's own methods (such as `sftp`) as well as for
    # anything the repository responds to. Accepts the optional second
    # argument of Object#respond_to?.
    def respond_to?(name, include_private = false)
      super || @repository.respond_to?(name, include_private)
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module MusicStory
  # Bundles the artist and genre repositories for one database, and knows how
  # to create and drop the tables they use.
  class Repository::Sequel
    # db - a Sequel::Database, or anything Sequel.connect accepts.
    #
    # The library is referred to as ::Sequel throughout, since this class is
    # itself named Sequel.
    def initialize(db)
      db = ::Sequel.connect(db) unless db.is_a?(::Sequel::Database)
      @db = db
      @artist_repo = Repository::ArtistSequel.new(db)
      @genre_repo  = Repository::GenreSequel.new(db)
      # point each association mapper at the repository holding its targets
      [:similar_artists, :influenced_by_artists, :successor_artists].each do |prop|
        @artist_repo.mapper(prop).target_repo = @artist_repo
      end
      [:main_genres, :secondary_genres, :influenced_by_genres].each do |prop|
        @artist_repo.mapper(prop).target_repo = @genre_repo
      end
    end

    attr_reader :artist_repo, :genre_repo, :db

    # Drops every table made by create_tables!. Best effort: a table that
    # cannot be dropped (for instance because it does not exist) is skipped.
    def drop_tables!
      [:genres, :artist_associations, :artist_genres, :artists].each do |table|
        begin
          @db.drop_table(table)
        rescue ::Sequel::Error
          # carry on with the remaining tables; only errors raised by Sequel
          # are ignored, so programming errors still surface
        end
      end
    end

    # Creates the artists and genres tables plus the two join tables that
    # hold artist/artist and artist/genre relations.
    def create_tables!
      @db.create_table(:artist_associations, :ignore_index_errors=>true) do
        Integer :from_artist_id, :null=>false
        Integer :to_artist_id, :null=>false
        String  :relation, :size=>16

        primary_key [:from_artist_id, :relation, :to_artist_id]

        index [:to_artist_id], :name=>:to_artist_id
      end

      @db.create_table(:artist_genres, :ignore_index_errors=>true) do
        Integer :artist_id, :null=>false
        Integer :genre_id, :null=>false
        String  :relation, :size=>16

        primary_key [:artist_id, :relation, :genre_id]

        index [:genre_id], :name=>:genre_id
      end

      @db.create_table(:artists, :ignore_index_errors=>true) do
        primary_key :id
        String :name, :null=>false, :size=>255
        String :forename, :size=>255
        String :real_name, :size=>255
        String :role, :size=>64
        String :type, :size=>64
        String :country, :size=>64
        String :summary_html, :text=>true
        String :bio_html, :text=>true
        String :image_filename, :text=>true

        index [:name], :name=>:name
      end

      @db.create_table(:genres) do
        primary_key :id
        String :name, :null=>false, :size=>255
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module MusicStory
  # SAX handler that turns HTML into plain text: each <br> becomes a newline
  # and every other tag is dropped, keeping only the text content.
  # Other tags (say <p> into \n\n) could be supported if they turn up;
  # MusicStory only seems to use <br> though
  class Utils::HTMLToText < Nokogiri::XML::SAX::Document
    # Parses +html+ and returns its text, stripped of surrounding whitespace.
    def self.convert(html)
      handler = new
      parser  = Nokogiri::HTML::SAX::Parser.new(handler)
      parser.parse(html)
      handler.to_s
    end

    def initialize
      @result = ''
    end

    # text content (and, via the alias, CDATA) is appended untouched
    def characters(string)
      @result << string
    end
    alias :cdata_block :characters

    # only <br>, in any letter case, leaves a mark in the output
    def start_element(name, attributes=nil)
      return unless name.downcase == 'br'
      @result << "\n"
    end

    # The text gathered so far, without leading or trailing whitespace.
    def to_s
      @result.strip
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module MusicStory
  module Utils
    # Copies every artist found in an artist XML repository, together with
    # the genres and artists it refers to, into the Sequel repositories.
    class XMLToDBImporter
      # Imports the XML file at +filename+ into +db+.
      #
      # db            - handed to Repository::Sequel.new
      # create_tables - when true, the tables are created before importing
      #
      # Returns the hash produced by #import with the Repository::Sequel
      # instance added under :sequel_repos.
      def self.import_file_into_db(filename, db, create_tables=false)
        sequel_repos = Repository::Sequel.new(db)
        sequel_repos.create_tables! if create_tables
        results = Repository::ArtistXMLFile.new_with_open_file(filename) do |xml_repo|
          new(xml_repo, sequel_repos).import
        end

        return results.merge(:sequel_repos => sequel_repos)
      end

      # xml_repo     - source of artists; must respond to each
      # sequel_repos - must respond to artist_repo and genre_repo
      def initialize(xml_repo, sequel_repos)
        @xml_repo = xml_repo
        @sequel_repos = sequel_repos
      end

      # Stores each artist from the XML repository, one transaction per
      # artist: first its associated artists, then its genres, then the
      # artist itself.
      #
      # An error raised while storing one artist is recorded and the import
      # carries on with the next artist.
      #
      # Returns {:successes => [artist, ...], :failures => [[artist, error], ...]};
      # every artist appears in exactly one of the two lists.
      def import
        failures  = []
        successes = []

        @xml_repo.each do |artist|
          begin
            @sequel_repos.artist_repo.transaction do
              artist.all_associated_artists.each do |a|
                @sequel_repos.artist_repo.store(a)
              end
              artist.all_genres.each do |g|
                @sequel_repos.genre_repo.store(g)
              end
              @sequel_repos.artist_repo.store(artist)
            end
            # only reached when the transaction block completed without raising
            successes << artist
          rescue => e
            failures << [artist, e]
          end
        end

        {:successes => successes, :failures => failures}
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,197 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: music_story
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 2
|
10
|
+
version: 1.0.2
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Matthew Willson
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-10-03 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: sequel
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ~>
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 43
|
29
|
+
segments:
|
30
|
+
- 3
|
31
|
+
- 11
|
32
|
+
- 0
|
33
|
+
version: 3.11.0
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: persistence
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 17
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
- 3
|
48
|
+
- 1
|
49
|
+
version: 0.3.1
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: thin_models
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 19
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
- 1
|
64
|
+
- 4
|
65
|
+
version: 0.1.4
|
66
|
+
type: :runtime
|
67
|
+
version_requirements: *id003
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: nokogiri
|
70
|
+
prerelease: false
|
71
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
hash: 3
|
77
|
+
segments:
|
78
|
+
- 1
|
79
|
+
- 5
|
80
|
+
- 0
|
81
|
+
version: 1.5.0
|
82
|
+
type: :runtime
|
83
|
+
version_requirements: *id004
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
name: net-sftp
|
86
|
+
prerelease: false
|
87
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
hash: 3
|
93
|
+
segments:
|
94
|
+
- 0
|
95
|
+
version: "0"
|
96
|
+
type: :runtime
|
97
|
+
version_requirements: *id005
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: minitest
|
100
|
+
prerelease: false
|
101
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
hash: 3
|
107
|
+
segments:
|
108
|
+
- 0
|
109
|
+
version: "0"
|
110
|
+
type: :development
|
111
|
+
version_requirements: *id006
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: sqlite3
|
114
|
+
prerelease: false
|
115
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ">="
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
hash: 3
|
121
|
+
segments:
|
122
|
+
- 0
|
123
|
+
version: "0"
|
124
|
+
type: :development
|
125
|
+
version_requirements: *id007
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: mock_sftp
|
128
|
+
prerelease: false
|
129
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
130
|
+
none: false
|
131
|
+
requirements:
|
132
|
+
- - ">="
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
hash: 3
|
135
|
+
segments:
|
136
|
+
- 0
|
137
|
+
version: "0"
|
138
|
+
type: :development
|
139
|
+
version_requirements: *id008
|
140
|
+
description:
|
141
|
+
email:
|
142
|
+
- matthew@playlouder.com
|
143
|
+
executables: []
|
144
|
+
|
145
|
+
extensions: []
|
146
|
+
|
147
|
+
extra_rdoc_files: []
|
148
|
+
|
149
|
+
files:
|
150
|
+
- lib/music_story/version.rb
|
151
|
+
- lib/music_story/model/genre.rb
|
152
|
+
- lib/music_story/model/batch.rb
|
153
|
+
- lib/music_story/model/artist.rb
|
154
|
+
- lib/music_story/utils/html_to_text.rb
|
155
|
+
- lib/music_story/utils/xml_to_db_importer.rb
|
156
|
+
- lib/music_story/repository/sequel.rb
|
157
|
+
- lib/music_story/repository/artist_xml_file.rb
|
158
|
+
- lib/music_story/repository/batch_sftp.rb
|
159
|
+
- lib/music_story/repository/artist_sequel.rb
|
160
|
+
- lib/music_story/repository/genre_sequel.rb
|
161
|
+
- lib/music_story.rb
|
162
|
+
- README.txt
|
163
|
+
homepage:
|
164
|
+
licenses: []
|
165
|
+
|
166
|
+
post_install_message:
|
167
|
+
rdoc_options: []
|
168
|
+
|
169
|
+
require_paths:
|
170
|
+
- lib
|
171
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
172
|
+
none: false
|
173
|
+
requirements:
|
174
|
+
- - ">="
|
175
|
+
- !ruby/object:Gem::Version
|
176
|
+
hash: 3
|
177
|
+
segments:
|
178
|
+
- 0
|
179
|
+
version: "0"
|
180
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
181
|
+
none: false
|
182
|
+
requirements:
|
183
|
+
- - ">="
|
184
|
+
- !ruby/object:Gem::Version
|
185
|
+
hash: 3
|
186
|
+
segments:
|
187
|
+
- 0
|
188
|
+
version: "0"
|
189
|
+
requirements: []
|
190
|
+
|
191
|
+
rubyforge_project:
|
192
|
+
rubygems_version: 1.8.24
|
193
|
+
signing_key:
|
194
|
+
specification_version: 3
|
195
|
+
summary: Wrapper code for the MusicStory data product
|
196
|
+
test_files: []
|
197
|
+
|