unclekryon 0.4.9.pre.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +34 -0
  3. data/Gemfile.lock +43 -0
  4. data/LICENSE +674 -0
  5. data/README.md +55 -0
  6. data/Rakefile +59 -0
  7. data/bin/unclekryon +30 -0
  8. data/iso/can_provs_terrs.yaml +54 -0
  9. data/iso/countries.yaml +3050 -0
  10. data/iso/iso.yaml +8 -0
  11. data/iso/languages.yaml +5641 -0
  12. data/iso/regions.yaml +42 -0
  13. data/iso/subregions.yaml +6 -0
  14. data/iso/usa_states.yaml +230 -0
  15. data/lib/unclekryon.rb +384 -0
  16. data/lib/unclekryon/data/album_data.rb +147 -0
  17. data/lib/unclekryon/data/artist_data.rb +109 -0
  18. data/lib/unclekryon/data/artist_data_data.rb +146 -0
  19. data/lib/unclekryon/data/aum_data.rb +75 -0
  20. data/lib/unclekryon/data/base_data.rb +79 -0
  21. data/lib/unclekryon/data/pic_data.rb +76 -0
  22. data/lib/unclekryon/data/release_data.rb +57 -0
  23. data/lib/unclekryon/data/social_data.rb +39 -0
  24. data/lib/unclekryon/data/timespan_data.rb +70 -0
  25. data/lib/unclekryon/dev_opts.rb +41 -0
  26. data/lib/unclekryon/hacker.rb +327 -0
  27. data/lib/unclekryon/iso.rb +341 -0
  28. data/lib/unclekryon/iso/base_iso.rb +196 -0
  29. data/lib/unclekryon/iso/can_prov_terr.rb +113 -0
  30. data/lib/unclekryon/iso/country.rb +133 -0
  31. data/lib/unclekryon/iso/language.rb +241 -0
  32. data/lib/unclekryon/iso/region.rb +53 -0
  33. data/lib/unclekryon/iso/subregion.rb +53 -0
  34. data/lib/unclekryon/iso/usa_state.rb +106 -0
  35. data/lib/unclekryon/jsoner.rb +124 -0
  36. data/lib/unclekryon/log.rb +111 -0
  37. data/lib/unclekryon/parsers/kryon_aum_year_album_parser.rb +499 -0
  38. data/lib/unclekryon/parsers/kryon_aum_year_parser.rb +413 -0
  39. data/lib/unclekryon/server.rb +29 -0
  40. data/lib/unclekryon/trainer.rb +231 -0
  41. data/lib/unclekryon/uploader.rb +29 -0
  42. data/lib/unclekryon/util.rb +228 -0
  43. data/lib/unclekryon/version.rb +26 -0
  44. data/unclekryon.gemspec +67 -0
  45. metadata +189 -0
@@ -0,0 +1,499 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of UncleKryon-server.
7
+ # Copyright (c) 2017-2019 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # UncleKryon-server is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # UncleKryon-server is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with UncleKryon-server. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nokogiri'
25
+ require 'open-uri'
26
+
27
+ require 'unclekryon/dev_opts'
28
+ require 'unclekryon/iso'
29
+ require 'unclekryon/log'
30
+ require 'unclekryon/trainer'
31
+ require 'unclekryon/util'
32
+
33
+ require 'unclekryon/data/album_data'
34
+ require 'unclekryon/data/aum_data'
35
+ require 'unclekryon/data/pic_data'
36
+ require 'unclekryon/data/timespan_data'
37
+
38
+ module UncleKryon
39
+ class KryonAumYearAlbumParser
40
+ include Logging
41
+
42
+ attr_accessor :album
43
+ attr_accessor :artist
44
+ attr_accessor :options
45
+ attr_accessor :trainers
46
+ attr_accessor :training
47
+ attr_accessor :updated_on
48
+ attr_accessor :url
49
+
50
+ alias_method :training?,:training
51
+
52
# Creates a parser for a single album page.
#
# artist::         object whose +albums+ collection #parse_site reads and updates (may be set later)
# url::            album page URL (may be set later via #parse_site)
# album::          initial value for the +album+ accessor; #parse_site replaces it
# training::       when true, #parse_site only trains the taggers and returns early
# train_filepath:: passed straight to Trainers.new
# updated_on::     timestamp stamped on new/changed data; when empty (per Util.empty_s?)
#                  the current time formatted by Util.format_datetime is used
# options::        any extra keyword arguments, stored untouched in +options+
def initialize(artist=nil,url=nil,album: nil,training: false,train_filepath: nil,updated_on: nil,
               **options)
  @album = album
  @artist = artist
  @options = options
  @url = url

  # A caller-supplied timestamp must be kept. Assigning only in the "empty" case
  # left @updated_on nil whenever a value was actually passed in.
  @updated_on = Util.empty_s?(updated_on) ? Util.format_datetime(DateTime.now()) : updated_on

  @trainers = Trainers.new(train_filepath)
  @training = training

  # Keys look like short codes for the tag names on the right (TODO confirm against Trainer);
  # the tag names are the values #parse_dump switches on.
  @trainers['aum_year_album'] = Trainer.new({
    'alds'=>'album_dates',
    'altt'=>'album_title',
    'allo'=>'album_locations',
    'almi'=>'album_mini_desc',
    'alma'=>'album_main_desc',
    'aust'=>'aum_subtitle',
    'aulg'=>'aum_languages', # See 2018 "Montreal QB w/Robert Coxon (3)" aums' subtitles "FRENCH"
    'autt'=>'aum_title',
    'autm'=>'aum_timespan',
    'ausz'=>'aum_filesize',
    'aufn'=>'aum_filename',
    'audu'=>'dump',
    'i'   =>'ignore'
  })

  # Second-level tagger applied to each line of an 'album_mini_desc' paragraph.
  @trainers['aum_year_album_mini_desc'] = Trainer.new({
    'd'=>'date',
    'l'=>'location',
    's'=>'desc',
    'i'=>'ignore'
  })
end
85
+
86
# Downloads one album page and parses it into an album stored in @artist.albums[@url].
#
# artist:: replaces @artist when non-nil
# url::    replaces @url when non-nil
#
# Returns the parsed album, or nil when the URL is on the exclusion list.
# Raises ArgumentError when no artist is set or the URL is nil/blank.
def parse_site(artist=nil,url=nil)
  @artist = artist unless artist.nil?()
  @url = url unless url.nil?()

  # URLs that return 404 or are empty; fix by hand
  exclude_urls = /
    awakeningzone\.com\/Episode\.aspx\?EpisodeID\=|
    www\.talkshoe\.com\/talkshoe\/web\/audioPop\.jsp\?episodeId\=
  /ix

  if @url =~ exclude_urls
    log.warn("Excluding Album URL #{@url}")
    return
  end

  @trainers.load_file()

  raise ArgumentError,"Artist cannot be nil" if @artist.nil?()
  raise ArgumentError,"URL cannot be empty" if @url.nil?() || (@url = @url.strip()).empty?()

  # Album data (flags are okay) should never go in this, only for aums, pics, etc.
  @local_dump = {
    :album_dates=>false,
    :album_title=>false,
    :album_locations=>false,
    :album_mini_desc=>false,
    :album_main_desc=>false,
    :aums=>0,
    :aum_subtitle=>[],
    :aum_languages=>[],
    :aum_title=>[],
    :aum_timespan=>[],
    :aum_filesize=>[],
    :aum_filename=>[]
  }

  # Force 'utf-8'
  # - See charset "X-MAC-ROMAN" in 2017 "The Discovery Series", 2016 "Kryon in Budapest (5)"
  #
  # Open through the parsed URI instead of Kernel#open: open-uri's Kernel#open patch
  # no longer exists on Ruby 3.0+, and Kernel#open would run a command for a "|..." string.
  # The block form closes the download once Nokogiri has read it.
  doc = URI.parse(@url).open() { |io| Nokogiri::HTML(io,nil,'utf-8') }

  old_album = @artist.albums[@url]

  # An unknown URL has nothing to clone; start from a blank album instead of
  # calling methods on nil. (Assumes AlbumData.new takes no arguments, like the
  # AumData.new/PicData.new calls elsewhere in this class -- TODO confirm.)
  # NOTE(review): #clone is shallow unless AlbumData overrides copying, so the
  # clone may share its aums/pics collections with old_album -- verify.
  @album = old_album.nil?() ? AlbumData.new() : old_album.clone()
  @album.updated_on = @updated_on
  @album.url = @url

  if old_album.nil?()
    @artist.albums[@url] = @album
  end

  parse_dump(doc,@album) # Must be first because other methods rely on @local_dump

  return @album if @training # Currently, no other training occurs

  parse_pics(doc,@album)
  parse_aums(doc,@album)

  # Nothing changed compared to the stored album, so keep its original timestamp.
  if @album == old_album
    @album.updated_on = old_album.updated_on
  end

  @artist.albums[@url] = @album

  return @album
end
151
+
152
# Turns every MP3 link (<a href>) of the page into an AumData stored in album.aums,
# keyed by the aum's URL.
#
# Subtitle, languages, title, timespan and (fallback) filesize are looked up by
# position in the lists #parse_dump collected in @local_dump, so #parse_dump has to
# run first. album.updated_on is bumped to @updated_on whenever an aum is new or
# differs from the stored one.
def parse_aums(doc,album)
  anchors = doc.css('a')

  return if anchors.nil?

  mp3_regex = /\.mp3/i
  skipped_links = /
    files\.kryonespanol\.com\/audio\/
  /ix

  aum_index = 0 # Counted by hand (not #each_with_index) because skipped links must not advance it

  anchors.each do |anchor|
    next if anchor.nil?

    href = anchor['href']

    next if href.nil? || href.empty?
    next unless href =~ mp3_regex
    next if href =~ skipped_links

    aum = AumData.new
    aum.url = Util.clean_data(href)
    aum.filename = Util.parse_url_filename(aum.url)
    aum.updated_on = @updated_on

    # Links starting with "./" or "../" are resolved against the album page's URL
    aum.url = Util.clean_link(@url,aum.url) if aum.url =~ /\A\.\.?\//

    # Filesize from the HTTP headers; slow, so skipped while testing
    unless DevOpts.instance.test?()
      begin
        header = Util.get_url_header_data(aum.url)
        aum.filesize = header['content-length']
        aum.filesize = aum.filesize[0] if aum.filesize.is_a?(Array)
      rescue => e
        raise e.exception("#{e.message}; couldn't get header data for #{aum.url}")
      end
    end

    # Subtitle
    subtitles = @local_dump[:aum_subtitle]

    if aum_index >= subtitles.length
      log.warn("No subtitle for: #{aum.filename},#{aum.url}")
    else
      aum.subtitle = subtitles[aum_index]
    end

    # Languages
    languages = @local_dump[:aum_languages]
    aum.languages = languages[aum_index] if aum_index < languages.length

    # Title
    titles = @local_dump[:aum_title]

    if aum_index < titles.length
      aum.title = titles[aum_index]
    else
      # No title was dumped, so set it to something at least
      filename = aum.filename

      if filename.nil?() || filename.strip().empty?()
        aum.title = aum.subtitle
        log.warn("Using subtitle as title: #{aum.title}")
      else
        # The filename is more descriptive than the subtitle
        aum.title = filename.gsub(mp3_regex,'').strip()
        log.warn("Using filename as title: #{aum.title}")
      end
    end

    # Timespan
    timespans = @local_dump[:aum_timespan]

    if aum_index < timespans.length
      aum.timespan = timespans[aum_index]
    else
      log.warn("No timespan for: #{aum.title},#{aum.subtitle},#{aum.filename},#{aum.url}")
    end

    # Filesize from the page text, only if the headers gave nothing
    filesizes = @local_dump[:aum_filesize]

    if (aum.filesize.nil?() || aum.filesize.strip().empty?) && aum_index < filesizes.length
      aum.filesize = filesizes[aum_index]
      log.warn("Using local dump filesize: #{aum.filesize}")
    end

    aum_index += 1

    # Unchanged aums keep their stored timestamp; anything else marks the album as updated
    unchanged = album.aums.key?(aum.url) && aum == album.aums[aum.url]

    if unchanged
      aum.updated_on = album.aums[aum.url].updated_on
    else
      album.updated_on = @updated_on
    end

    album.aums[aum.url] = aum
  end
end
251
+
252
# Walks every <td> of the album page and sorts its text into one of:
# - the per-aum lists of @local_dump (timespans, filesizes, titles, subtitles, languages),
# - the album's mini/main description,
# - or album.dump (everything that was not recognized).
#
# Cells that look like a time, a size or an MP3 filename are recognized by regex.
# Any other cell is split into paragraphs and each paragraph is tagged by the
# 'aum_year_album' trainer. A cell goes to album.dump unless at least one of its
# paragraphs was consumed. In training mode the paragraphs are only fed to the
# trainers and the whole cell is still dumped.
#
# Must run before #parse_aums, which reads @local_dump by position.
def parse_dump(doc,album)
  album.dump = []
  tds = doc.css('td')

  return if tds.nil?

  filename_regex = /\.mp3[[:space:]]*\z/i
  # 2017 "Petra, Jordan (5)" has a ":" in the megabytes cell
  size_regex = /\A[[:space:]]*[[:digit:]]+(\.|\:|[[:digit:]]|[[:space:]])*megabytes[[:space:]]*\z/i
  # 2017 "Monument Valley Tour (11)" has a "." in the minutes cell
  # 2017 "SUMMER LIGHT CONFERENCE PANEL (1)" is a special case ("One hour 6 minutes - (66 minutes)")
  time_regex = /
    \A[[:space:]]*[[:digit:]]+(\:|\.|[[:digit:]]|[[:space:]])*(minutes|Min)[[:space:]]*\z|
    \([[:space:]]*[[:digit:]]+[[:space:]]+minutes[[:space:]]*\)[[:space:]]*\z
  /ix
  # 2017 "KRYON INDIA-NEPAL TOUR PART 1 (10)" doesn't have the word "megabytes"
  time_or_size_regex = /\A[[:space:]]*[[:digit:]]+(\:|\.|[[:digit:]]|[[:space:]])*\z/i
  # 2015 ones have a lot of "13:12 Min - 15.9 megs"
  time_and_size_regex = /\A[[:space:]]*[[:digit:]]+[\:\.][[:digit:]]+[[:space:]]+Min[[:space:]]+\-[[:space:]]+[[:digit:]]+\.?[[:digit:]]*[[:space:]]*megs/i

  # How many sizes/times were seen so far; used to guess what a bare number is
  size_count = 0
  time_count = 0

  tds.each do |td|
    next if td.nil?
    next if td.content.nil?

    orig_c = Util.clean_charset(td.content)
    c = Util.clean_data(orig_c)

    next if c.empty?

    add_to_dump = true

    if c =~ time_regex
      @local_dump[:aum_timespan].push(TimespanData.new(c).to_s())
      add_to_dump = false
      time_count += 1
    elsif c =~ size_regex
      @local_dump[:aum_filesize].push(c)
      add_to_dump = false
      size_count += 1
    elsif c =~ time_or_size_regex
      # Time is usually before size, so equal counts mean this bare number is a time
      if time_count == size_count
        @local_dump[:aum_timespan].push(TimespanData.new(c).to_s())
        time_count += 1
      else
        @local_dump[:aum_filesize].push(c)
        size_count += 1
      end

      add_to_dump = false
    elsif c =~ time_and_size_regex
      time_and_size = c.split(/[[:space:]]*\-[[:space:]]*/) # Split on '-'

      @local_dump[:aum_timespan].push(TimespanData.new(time_and_size[0]).to_s())
      time_count += 1
      @local_dump[:aum_filesize].push(time_and_size[1])
      size_count += 1

      add_to_dump = false
    elsif c =~ filename_regex
      @local_dump[:aums] += 1
      add_to_dump = false
    else
      # Paragraphs: taken from the un-cleaned text so that line breaks survive
      pars = orig_c.gsub(/\A[[:space:]]+/,'').gsub(/[[:space:]]+\z/,'')
      pars = pars.split(/[\r\n\p{Zl}\p{Zp}]{2,}/)

      pars.each() do |par|
        par = par.gsub(/[[:blank:]]+/,' ').strip()
        par = Util.fix_shortwith_text(par)

        next if par.empty?()

        if @training
          if @trainers['aum_year_album'].train(par) == 'album_mini_desc'
            par.split(/\n+/).each() do |line|
              @trainers['aum_year_album_mini_desc'].train(line)
            end
          end

          next
        end

        tag = @trainers['aum_year_album'].tag(par)

        # For 2017 "RETURN TO LEMURIA (7)"
        if par =~ /\A[[:space:]]*MEDITATION[[:space:]]+-[[:space:]]+Kalei[[:space:]]+-[[:space:]]+John[[:space:]]+-[[:space:]]+Amber[[:space:]]*\z/i
          tag = 'aum_title'
          log.warn("Changing tag to aum_title: #{Util.clean_data(par)}")
        end

        # The former "no header yet" guard was hard-wired to true, so its branch
        # could never run; the aum_* tags are therefore handled directly here.
        case tag
        when 'album_title'
          # Only flagged; the cell itself still goes to album.dump
          @local_dump[:album_title] = true
        when 'album_dates'
          @local_dump[:album_dates] = true
        when 'album_locations'
          @local_dump[:album_locations] = true
        when 'album_mini_desc'
          par.split(/\n+/).each() do |line|
            line = Util.clean_data(line)

            next if line.empty?()

            # Lines tagged anything other than 'desc'/'ignore' are dropped silently
            case @trainers['aum_year_album_mini_desc'].tag(line)
            when 'desc'
              if !@local_dump[:album_mini_desc]
                # First desc line of this run replaces whatever the album had
                @local_dump[:album_mini_desc] = true
                album.mini_desc = line
              else
                album.mini_desc << ' | ' if !album.mini_desc.strip().empty?()
                album.mini_desc << line
              end
            when 'ignore'
              log.warn("Excluding mini desc content: #{line}")
            end
          end

          add_to_dump = false
        when 'album_main_desc'
          if !@local_dump[:album_main_desc]
            @local_dump[:album_main_desc] = true
            album.main_desc = ''.dup() # Unfrozen, because it is appended to below
          else
            album.main_desc << "\n\n" if !album.main_desc.strip().empty?()
          end

          par.split(/\n+/).each() do |line|
            album.main_desc << Util.clean_data(line) << "\n"
          end

          album.main_desc = album.main_desc.strip() # Remove last newline
          add_to_dump = false
        when 'ignore'
          log.warn("Excluding content: #{Util.clean_data(par)}")
          add_to_dump = false
        when 'aum_subtitle'
          @local_dump[:aum_subtitle].push(Util.clean_data(par))
          add_to_dump = false
        when 'aum_languages'
          # A language marker is kept as the subtitle text as well
          language_text = Util.clean_data(par)
          @local_dump[:aum_languages].push(Iso.languages.find_by_kryon(language_text))
          @local_dump[:aum_subtitle].push(language_text)
          add_to_dump = false
        when 'aum_title'
          @local_dump[:aum_title].push(Util.clean_data(par))

          # Special case for 2017 "LISBON, PORTUGAL (Fatima Tour) (3)"
          if par =~ /\A[[:space:]]*Lisbon[[:space:]]+Channeling[[:space:]]+1[[:space:]]*\z/i
            @local_dump[:aum_title].push('Lisbon Channeling 2')
            @local_dump[:aum_title].push('Lisbon Channeling 3')
            log.warn("Adding aum_titles for: #{Util.clean_data(par)}")
          end
          # For 2017 "KRYON INDIA-NEPAL TOUR PART 1 (10)" & "KRYON INDIA-NEPAL TOUR PART 2 (8)"
          if par =~ /\A[[:space:]]*PAGE[[:space:]]*(ONE|TWO)[[:space:]]*\z/i
            ignored_title = @local_dump[:aum_title].pop()
            log.warn("Ignoring aum title: #{ignored_title}")
          end

          add_to_dump = false
        when 'aum_filename'
          add_to_dump = false
        end
      end
    end

    # Raw HTML (bold, etc.) is deliberately not dumped: big font sizes are bad for mobile anyway
    album.dump.push(c) if add_to_dump
  end
end
447
+
448
# Turns every <img> of the page (minus known site graphics) into a PicData stored
# in album.pics, keyed by the picture's URL.
#
# The picture name is the alt text when present, otherwise the filename without
# its extension. album.updated_on is bumped to @updated_on whenever a picture is
# new or differs from the stored one.
def parse_pics(doc,album)
  images = doc.css('img')

  return if images.nil?

  # Buttons, menus and other site graphics that are not album pictures
  site_graphics = /
    buttonMP3\.png|
    freedownloadtype\.gif|
    handani\.gif|
    Kryonglobe\.jpg|
    MP3\-download\.jpg|
    MP3\-graphic\(SM\)\.jpg|
    NavMenu\_AUDIOmaster\.png|
    NavMenu\_master\.png|
    testimonials\.png
  /ix

  images.each do |image|
    next if image.nil?

    source = image['src']

    next if source.nil? || source.empty?

    if source =~ site_graphics
      log.warn("Excluding image: #{source}")
      next
    end

    pic = PicData.new()

    pic.url = Util.clean_link(url,source)
    pic.filename = Util.parse_url_filename(pic.url)

    pic.alt = image['alt']
    pic.alt = '' if Util.empty_s?(pic.alt)
    pic.caption = ''

    if Util.empty_s?(pic.alt)
      # No alt text, so name the picture after its file (extension removed)
      extension = File.extname(pic.filename)
      pic.name = File.basename(pic.filename,extension)
    else
      pic.name = pic.alt
    end

    pic.updated_on = @updated_on

    # Unchanged pictures keep their stored timestamp; anything else marks the album as updated
    unchanged = album.pics.key?(pic.url) && pic == album.pics[pic.url]

    if unchanged
      pic.updated_on = album.pics[pic.url].updated_on
    else
      album.updated_on = @updated_on
    end

    album.pics[pic.url] = pic
  end
end
498
+ end
499
+ end