unclekryon 0.4.9.pre.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +34 -0
  3. data/Gemfile.lock +43 -0
  4. data/LICENSE +674 -0
  5. data/README.md +55 -0
  6. data/Rakefile +59 -0
  7. data/bin/unclekryon +30 -0
  8. data/iso/can_provs_terrs.yaml +54 -0
  9. data/iso/countries.yaml +3050 -0
  10. data/iso/iso.yaml +8 -0
  11. data/iso/languages.yaml +5641 -0
  12. data/iso/regions.yaml +42 -0
  13. data/iso/subregions.yaml +6 -0
  14. data/iso/usa_states.yaml +230 -0
  15. data/lib/unclekryon.rb +384 -0
  16. data/lib/unclekryon/data/album_data.rb +147 -0
  17. data/lib/unclekryon/data/artist_data.rb +109 -0
  18. data/lib/unclekryon/data/artist_data_data.rb +146 -0
  19. data/lib/unclekryon/data/aum_data.rb +75 -0
  20. data/lib/unclekryon/data/base_data.rb +79 -0
  21. data/lib/unclekryon/data/pic_data.rb +76 -0
  22. data/lib/unclekryon/data/release_data.rb +57 -0
  23. data/lib/unclekryon/data/social_data.rb +39 -0
  24. data/lib/unclekryon/data/timespan_data.rb +70 -0
  25. data/lib/unclekryon/dev_opts.rb +41 -0
  26. data/lib/unclekryon/hacker.rb +327 -0
  27. data/lib/unclekryon/iso.rb +341 -0
  28. data/lib/unclekryon/iso/base_iso.rb +196 -0
  29. data/lib/unclekryon/iso/can_prov_terr.rb +113 -0
  30. data/lib/unclekryon/iso/country.rb +133 -0
  31. data/lib/unclekryon/iso/language.rb +241 -0
  32. data/lib/unclekryon/iso/region.rb +53 -0
  33. data/lib/unclekryon/iso/subregion.rb +53 -0
  34. data/lib/unclekryon/iso/usa_state.rb +106 -0
  35. data/lib/unclekryon/jsoner.rb +124 -0
  36. data/lib/unclekryon/log.rb +111 -0
  37. data/lib/unclekryon/parsers/kryon_aum_year_album_parser.rb +499 -0
  38. data/lib/unclekryon/parsers/kryon_aum_year_parser.rb +413 -0
  39. data/lib/unclekryon/server.rb +29 -0
  40. data/lib/unclekryon/trainer.rb +231 -0
  41. data/lib/unclekryon/uploader.rb +29 -0
  42. data/lib/unclekryon/util.rb +228 -0
  43. data/lib/unclekryon/version.rb +26 -0
  44. data/unclekryon.gemspec +67 -0
  45. metadata +189 -0
@@ -0,0 +1,499 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of UncleKryon-server.
7
+ # Copyright (c) 2017-2019 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # UncleKryon-server is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # UncleKryon-server is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with UncleKryon-server. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nokogiri'
25
+ require 'open-uri'
26
+
27
+ require 'unclekryon/dev_opts'
28
+ require 'unclekryon/iso'
29
+ require 'unclekryon/log'
30
+ require 'unclekryon/trainer'
31
+ require 'unclekryon/util'
32
+
33
+ require 'unclekryon/data/album_data'
34
+ require 'unclekryon/data/aum_data'
35
+ require 'unclekryon/data/pic_data'
36
+ require 'unclekryon/data/timespan_data'
37
+
38
+ module UncleKryon
39
+ class KryonAumYearAlbumParser
40
# Mixed in for the +log+ helper the parse methods call
# (presumably defined in unclekryon/log -- confirm).
include Logging

# Read/write parser state. All of it is assigned in #initialize;
# #parse_site may reassign +artist+, +url+ and +album+.
attr_accessor :album, :artist, :options, :trainers, :training, :updated_on, :url

# Predicate-style reader for the +training+ flag.
alias_method :training?, :training
51
+
52
# Sets up a parser for one album page and registers the two trainers
# ('aum_year_album' and 'aum_year_album_mini_desc') that #parse_dump uses.
#
# @param artist          object whose +albums+ hash #parse_site reads and updates
# @param url             URL of the album page
# @param album           optional album object to start from
# @param training        when truthy, #parse_site stops after #parse_dump
# @param train_filepath  handed unchanged to Trainers.new
# @param updated_on      timestamp stamped onto new/changed records; when
#                        Util.empty_s? reports it as empty, the current time
#                        (formatted by Util.format_datetime) is used instead
# @param options         any other keyword arguments, stored as-is
def initialize(artist=nil,url=nil,album: nil,training: false,train_filepath: nil,updated_on: nil,
               **options)
  @album = album
  @artist = artist
  @options = options
  @url = url

  # Fix: the timestamp used to be assigned only when the argument was empty,
  # so an explicitly supplied updated_on was discarded and left as nil.
  @updated_on = Util.empty_s?(updated_on) ? Util.format_datetime(DateTime.now) : updated_on

  @trainers = Trainers.new(train_filepath)
  @training = training

  # Short training key => tag name, for whole paragraphs of an album page.
  @trainers['aum_year_album'] = Trainer.new({
    'alds'=>'album_dates',
    'altt'=>'album_title',
    'allo'=>'album_locations',
    'almi'=>'album_mini_desc',
    'alma'=>'album_main_desc',
    'aust'=>'aum_subtitle',
    'aulg'=>'aum_languages', # See 2018 "Montreal QB w/Robert Coxon (3)" aums' subtitles "FRENCH"
    'autt'=>'aum_title',
    'autm'=>'aum_timespan',
    'ausz'=>'aum_filesize',
    'aufn'=>'aum_filename',
    'audu'=>'dump',
    'i'   =>'ignore'
  })

  # Short training key => tag name, for single lines of a mini description.
  @trainers['aum_year_album_mini_desc'] = Trainer.new({
    'd'=>'date',
    'l'=>'location',
    's'=>'desc',
    'i'=>'ignore'
  })
end
85
+
86
# Downloads the album page, then fills an album record from it via
# #parse_dump, #parse_pics and #parse_aums, and stores the record in
# +@artist.albums+ under the page URL.
#
# @param artist replaces @artist when not nil
# @param url    replaces @url when not nil
# @return the album record, or nil when the URL matches the exclusion list
# @raise [ArgumentError] if no artist is set, or the URL is nil/blank
def parse_site(artist=nil,url=nil)
  @artist = artist unless artist.nil?
  @url = url unless url.nil?

  # URLs that return 404 or are empty; fix by hand
  exclude_urls = /
    awakeningzone\.com\/Episode\.aspx\?EpisodeID\=|
    www\.talkshoe\.com\/talkshoe\/web\/audioPop\.jsp\?episodeId\=
  /ix

  if @url =~ exclude_urls
    log.warn("Excluding Album URL #{@url}")
    return
  end

  @trainers.load_file

  raise ArgumentError,"Artist cannot be nil" if @artist.nil?
  raise ArgumentError,"URL cannot be empty" if @url.nil? || (@url = @url.strip).empty?

  # Album data (flags are okay) should never go in this, only for aums, pics, etc.
  @local_dump = {
    :album_dates=>false,
    :album_title=>false,
    :album_locations=>false,
    :album_mini_desc=>false,
    :album_main_desc=>false,
    :aums=>0,
    :aum_subtitle=>[],
    :aum_languages=>[],
    :aum_title=>[],
    :aum_timespan=>[],
    :aum_filesize=>[],
    :aum_filename=>[]
  }

  # Force 'utf-8'
  # - See charset "X-MAC-ROMAN" in 2017 "The Discovery Series", 2016 "Kryon in Budapest (5)"
  #
  # URI.open (Ruby 2.5+) replaces the bare open(): open-uri's Kernel#open
  # patch is gone in Ruby 3.0, and Kernel#open runs a command for strings
  # that start with "|". The block form closes the handle after parsing.
  doc = URI.open(@url) { |io| Nokogiri::HTML(io,nil,'utf-8') }

  old_album = @artist.albums[@url]

  # Fix: nil.clone is nil, so an unknown URL used to blow up on the next
  # line with NoMethodError. Start from a fresh record instead.
  # NOTE(review): assumes AlbumData.new takes no arguments, like the
  # AumData.new / PicData.new calls elsewhere in this class -- confirm.
  @album = old_album.nil? ? AlbumData.new : old_album.clone
  @album.updated_on = @updated_on
  @album.url = @url

  @artist.albums[@url] = @album if old_album.nil?

  parse_dump(doc,@album) # Must be first because other methods rely on @local_dump

  return @album if @training # Currently, no other training occurs

  parse_pics(doc,@album)
  parse_aums(doc,@album)

  # Nothing changed compared to the stored album, so keep its old timestamp.
  @album.updated_on = old_album.updated_on if @album == old_album

  @artist.albums[@url] = @album

  return @album
end
151
+
152
# Creates an AumData record for every ".mp3" link found in +doc+ and stores
# it in +album.aums+, keyed by the record's URL.
#
# Subtitle, languages, title, timespan and (fallback) filesize are looked up
# by position in @local_dump, so #parse_dump has to run before this method.
#
# @param doc   parsed page; only +doc.css('a')+ is used
# @param album record whose +aums+ hash (and possibly +updated_on+) is updated
# @return nil when +doc.css('a')+ is nil, otherwise the result of iterating the links
def parse_aums(doc,album)
  anchors = doc.css('a')

  return if anchors.nil?

  mp3_regex = /\.mp3/i
  skipped_links = /
    files\.kryonespanol\.com\/audio\/
  /ix

  # Counts only the links that are kept, so it stays aligned with the
  # @local_dump arrays (hence no #each_with_index).
  pos = 0

  anchors.each do |anchor|
    next if anchor.nil?

    href = anchor['href']

    next if href.nil? || href.empty?
    next unless href =~ mp3_regex
    next if href =~ skipped_links

    aum = AumData.new
    aum.url = Util.clean_data(href)
    aum.filename = Util.parse_url_filename(aum.url)
    aum.updated_on = @updated_on

    # Links starting with "./" or "../" are resolved against the page URL.
    aum.url = Util.clean_link(@url,aum.url) if aum.url =~ /\A\.\.?\//

    # Filesize from the file's headers; slow, so skipped when testing.
    unless DevOpts.instance.test?
      begin
        header = Util.get_url_header_data(aum.url)
        aum.filesize = header['content-length']
        aum.filesize = aum.filesize[0] if aum.filesize.is_a?(Array)
      rescue => e
        raise e.exception("#{e.message}; couldn't get header data for #{aum.url}")
      end
    end

    # Subtitle
    subtitles = @local_dump[:aum_subtitle]

    if pos < subtitles.length
      aum.subtitle = subtitles[pos]
    else
      log.warn("No subtitle for: #{aum.filename},#{aum.url}")
    end

    # Languages
    languages = @local_dump[:aum_languages]
    aum.languages = languages[pos] if pos < languages.length

    # Title; always set it to something
    titles = @local_dump[:aum_title]

    if pos < titles.length
      aum.title = titles[pos]
    else
      afn = aum.filename

      if afn.nil? || afn.strip.empty?
        aum.title = aum.subtitle
        log.warn("Using subtitle as title: #{aum.title}")
      else
        # More descriptive than subtitle
        aum.title = afn.gsub(mp3_regex,'').strip
        log.warn("Using filename as title: #{aum.title}")
      end
    end

    # Timespan; a missing one is only logged
    timespans = @local_dump[:aum_timespan]

    if pos < timespans.length
      aum.timespan = timespans[pos]
    else
      msg = "No timespan for: #{aum.title},#{aum.subtitle},#{aum.filename},#{aum.url}"

      log.warn(msg)
    end

    # Filesize from the page text, if the headers did not provide one
    filesizes = @local_dump[:aum_filesize]

    if (aum.filesize.nil? || aum.filesize.strip.empty?) && pos < filesizes.length
      aum.filesize = filesizes[pos]
      log.warn("Using local dump filesize: #{aum.filesize}")
    end

    pos += 1

    # An unchanged record keeps its old timestamp; anything else marks the album as updated.
    if album.aums.key?(aum.url) && aum == album.aums[aum.url]
      aum.updated_on = album.aums[aum.url].updated_on
    else
      album.updated_on = @updated_on
    end

    album.aums[aum.url] = aum
  end
end
251
+
252
# Walks every <td> of +doc+ and sorts its text into one of three places:
# the per-file arrays/flags in @local_dump, the album's +mini_desc+ /
# +main_desc+, or the catch-all +album.dump+ array.
#
# Cells that look like a duration, a file size or an mp3 filename are
# recognised by regex. Everything else is split into paragraphs which are
# either fed to the trainers (when @training is set) or tagged by them.
#
# @param doc   parsed page; only +doc.css('td')+ is used
# @param album record whose +dump+ is reset and refilled, and whose
#              descriptions may be set
# @return nil when +doc.css('td')+ is nil, otherwise the result of iterating the cells
def parse_dump(doc,album)
  album.dump = []
  cells = doc.css('td')

  return if cells.nil?

  filename_regex = /\.mp3[[:space:]]*\z/i
  # 2017 "Petra, Jordan (5)" has a ":" in the megabytes cell
  size_regex = /\A[[:space:]]*[[:digit:]]+(\.|\:|[[:digit:]]|[[:space:]])*megabytes[[:space:]]*\z/i
  # 2017 "Monument Valley Tour (11)" has a "." in the minutes cell
  # 2017 "SUMMER LIGHT CONFERENCE PANEL (1)" is a special case ("One hour 6 minutes - (66 minutes)")
  time_regex = /
    \A[[:space:]]*[[:digit:]]+(\:|\.|[[:digit:]]|[[:space:]])*(minutes|Min)[[:space:]]*\z|
    \([[:space:]]*[[:digit:]]+[[:space:]]+minutes[[:space:]]*\)[[:space:]]*\z
  /ix
  # 2017 "KRYON INDIA-NEPAL TOUR PART 1 (10)" doesn't have the word "megabytes"
  time_or_size_regex = /\A[[:space:]]*[[:digit:]]+(\:|\.|[[:digit:]]|[[:space:]])*\z/i
  # 2015 ones have a lot of "13:12 Min - 15.9 megs"
  time_and_size_regex = /\A[[:space:]]*[[:digit:]]+[\:\.][[:digit:]]+[[:space:]]+Min[[:space:]]+\-[[:space:]]+[[:digit:]]+\.?[[:digit:]]*[[:space:]]*megs/i

  timespans = @local_dump[:aum_timespan]
  filesizes = @local_dump[:aum_filesize]

  size_count = 0
  time_count = 0

  cells.each do |cell|
    next if cell.nil?
    next if cell.content.nil?

    raw_text = Util.clean_charset(cell.content)
    text = Util.clean_data(raw_text)

    next if text.empty?

    # Cleared as soon as any part of the cell has been consumed elsewhere.
    keep_in_dump = true

    case text
    when time_regex
      timespans.push(TimespanData.new(text).to_s)
      keep_in_dump = false
      time_count += 1
    when size_regex
      filesizes.push(text)
      keep_in_dump = false
      size_count += 1
    when time_or_size_regex
      # Bare number: time is usually before size, so equal counts mean "time"
      if time_count == size_count
        timespans.push(TimespanData.new(text).to_s)
        time_count += 1
      else
        filesizes.push(text)
        size_count += 1
      end

      keep_in_dump = false
    when time_and_size_regex
      time_part,size_part = text.split(/[[:space:]]*\-[[:space:]]*/) # Split on '-'

      timespans.push(TimespanData.new(time_part).to_s)
      time_count += 1
      filesizes.push(size_part)
      size_count += 1

      keep_in_dump = false
    when filename_regex
      @local_dump[:aums] += 1
      keep_in_dump = false
    else
      # Paragraphs: trim the raw text, then split on runs of 2+ line breaks
      trimmed = raw_text.gsub(/\A[[:space:]]+/,'').gsub(/[[:space:]]+\z/,'')
      paragraphs = trimmed.split(/[\r\n\p{Zl}\p{Zp}]{2,}/)

      paragraphs.each do |par|
        par = par.gsub(/[[:blank:]]+/,' ').strip
        par = Util.fix_shortwith_text(par)

        next if par.empty?

        if @training
          if @trainers['aum_year_album'].train(par) == 'album_mini_desc'
            par.split(/\n+/).each do |line|
              @trainers['aum_year_album_mini_desc'].train(line)
            end
          end

          next
        end

        # The real header check is switched off; it used to be:
        #   @local_dump[:album_title] || @local_dump[:album_dates] ||
        #   @local_dump[:album_locations] || @local_dump[:album_mini_desc] || @local_dump[:album_main_desc]
        has_header = true
        tag = @trainers['aum_year_album'].tag(par)

        # For 2017 "RETURN TO LEMURIA (7)"
        if par =~ /\A[[:space:]]*MEDITATION[[:space:]]+-[[:space:]]+Kalei[[:space:]]+-[[:space:]]+John[[:space:]]+-[[:space:]]+Amber[[:space:]]*\z/i
          tag = 'aum_title'
          log.warn("Changing tag to aum_title: #{Util.clean_data(par)}")
        end

        case tag
        when 'album_title'
          # These three only raise a flag; the cell text still goes to album.dump.
          @local_dump[:album_title] ||= true
        when 'album_dates'
          @local_dump[:album_dates] ||= true
        when 'album_locations'
          @local_dump[:album_locations] ||= true
        when 'album_mini_desc'
          par.split(/\n+/).each do |line|
            line = Util.clean_data(line)

            next if line.empty?

            case @trainers['aum_year_album_mini_desc'].tag(line)
            when 'desc'
              if @local_dump[:album_mini_desc]
                # Later desc lines are appended, separated by ' | '
                album.mini_desc << ' | ' unless album.mini_desc.strip.empty?
                album.mini_desc << line
              else
                @local_dump[:album_mini_desc] = true
                album.mini_desc = line
              end
            when 'ignore'
              log.warn("Excluding mini desc content: #{line}")
            end
          end

          keep_in_dump = false
        when 'album_main_desc'
          if @local_dump[:album_main_desc]
            album.main_desc << "\n\n" unless album.main_desc.strip.empty?
          else
            @local_dump[:album_main_desc] = true
            album.main_desc = ''.dup
          end

          par.split(/\n+/).each do |line|
            album.main_desc << Util.clean_data(line) << "\n"
          end

          album.main_desc = album.main_desc.strip # Remove last newline
          keep_in_dump = false
        when 'ignore'
          log.warn("Excluding content: #{Util.clean_data(par)}")
          keep_in_dump = false
        else
          if has_header
            case tag
            when 'aum_subtitle'
              @local_dump[:aum_subtitle].push(Util.clean_data(par))
              keep_in_dump = false
            when 'aum_languages'
              # The language text is recorded as a subtitle too
              lang_text = Util.clean_data(par)
              @local_dump[:aum_languages].push(Iso.languages.find_by_kryon(lang_text))
              @local_dump[:aum_subtitle].push(lang_text)
              keep_in_dump = false
            when 'aum_title'
              @local_dump[:aum_title].push(Util.clean_data(par))

              # Special case for 2017 "LISBON, PORTUGAL (Fatima Tour) (3)"
              if par =~ /\A[[:space:]]*Lisbon[[:space:]]+Channeling[[:space:]]+1[[:space:]]*\z/i
                @local_dump[:aum_title].push('Lisbon Channeling 2')
                @local_dump[:aum_title].push('Lisbon Channeling 3')
                log.warn("Adding aum_titles for: #{Util.clean_data(par)}")
              end
              # For 2017 "KRYON INDIA-NEPAL TOUR PART 1 (10)" & "KRYON INDIA-NEPAL TOUR PART 2 (8)"
              if par =~ /\A[[:space:]]*PAGE[[:space:]]*(ONE|TWO)[[:space:]]*\z/i
                dropped = @local_dump[:aum_title].pop
                log.warn("Ignoring aum title: #{dropped}")
              end

              keep_in_dump = false
            when 'aum_filename'
              keep_in_dump = false
            end
          else
            log.warn("No header yet so ignoring: #{Util.clean_data(par)}")
          end
        end
      end
    end

    # Only the cleaned text is stored, not the cell's HTML: if the font
    # size is big, it's bad for mobile anyway.
    album.dump.push(text) if keep_in_dump
  end
end
447
+
448
# Creates a PicData record for every <img> in +doc+ (except known site
# chrome such as buttons and nav menus) and stores it in +album.pics+,
# keyed by the picture's URL.
#
# @param doc   parsed page; only +doc.css('img')+ is used
# @param album record whose +pics+ hash (and possibly +updated_on+) is updated
# @return nil when +doc.css('img')+ is nil, otherwise the result of iterating the images
def parse_pics(doc,album)
  images = doc.css('img')

  return if images.nil?

  skipped_images = /
    buttonMP3\.png|
    freedownloadtype\.gif|
    handani\.gif|
    Kryonglobe\.jpg|
    MP3\-download\.jpg|
    MP3\-graphic\(SM\)\.jpg|
    NavMenu\_AUDIOmaster\.png|
    NavMenu\_master\.png|
    testimonials\.png
  /ix

  images.each do |image|
    next if image.nil?

    src = image['src']

    next if src.nil? || src.empty?

    if src =~ skipped_images
      log.warn("Excluding image: #{src}")
      next
    end

    pic = PicData.new

    pic.url = Util.clean_link(url,src)
    pic.filename = Util.parse_url_filename(pic.url)

    pic.alt = image['alt']
    pic.alt = '' if Util.empty_s?(pic.alt)
    pic.caption = ''

    # Name the picture after its alt text, or after the filename without
    # its extension when there is no alt text.
    if Util.empty_s?(pic.alt)
      pic.name = File.basename(pic.filename,File.extname(pic.filename))
    else
      pic.name = pic.alt
    end

    pic.updated_on = @updated_on

    # An unchanged record keeps its old timestamp; anything else marks the album as updated.
    if album.pics.key?(pic.url) && pic == album.pics[pic.url]
      pic.updated_on = album.pics[pic.url].updated_on
    else
      album.updated_on = @updated_on
    end

    album.pics[pic.url] = pic
  end
end
498
+ end
499
+ end