bandcamp-discover 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 82fa227242a4bf5508d6391e6791e2a6f45b041eb9b05c824b94fe4e5aa15cd0
4
+ data.tar.gz: d2e8480b5267e584a05d2af1c36d16ad7ea6d9af5416fe094c73c7405723a778
5
+ SHA512:
6
+ metadata.gz: 7e5ecead1e89a8a9995df2dc860bfe2db7e5935b876f7fb4119ec6821adecd9b9ba8a18834bfdbdb877cc2cd82449c1dad44b36d724b794733edd32271fbeb3a
7
+ data.tar.gz: bd904b22d2ae10650a8bed975df5e6c3caa7d1a64624dcb2d5a1da1dc568a52f77ba3717a45162e9d80845ef2f88315fe8e4a1daf72caf28a9c91261052dccbb
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ *~
2
+ *#*#
3
+ /pkg/
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Gems are resolved from rubygems.org.
source "https://rubygems.org"

# The dependency list itself is taken from the gemspec in this directory.
gemspec
data/README.rdoc ADDED
@@ -0,0 +1,6 @@
1
+ = bandcamp-discover
2
+
3
+ Describe your project here
4
+
5
+ :include:bandcamp-discover.rdoc
6
+
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
# Rake tasks for the gem: documentation, packaging and tests.
require "rake/clean"
require "rubygems"
require "rubygems/package_task"
require "rdoc/task"
require "rake/testtask"

require "bundler/gem_tasks"

# RDoc task built from the README, the library sources and the executables.
Rake::RDocTask.new do |rdoc|
  rdoc.main = "README.rdoc"
  rdoc.rdoc_files.include("README.rdoc", "lib/**/*.rb", "bin/**/*")
  rdoc.title = "Your application title"
end

# Packaging task driven by the gemspec in this directory.
gemspec = Gem::Specification.load("bandcamp-discover.gemspec")
Gem::PackageTask.new(gemspec) { |_pkg| }

# Test task: every test/*_test.rb file, with test/ on the load path.
Rake::TestTask.new do |test_task|
  test_task.libs << "test"
  test_task.test_files = FileList["test/*_test.rb"]
end

task default: :test
@@ -0,0 +1,31 @@
1
# Ensure we require the local version and not one we might have installed already
require_relative 'lib/bandcamp-discover/version'

# Gem specification for bandcamp-discover. The specification object is the
# last expression of the file.
Gem::Specification.new do |s|
  s.name = 'bandcamp-discover'
  s.version = BandcampDiscover::VERSION
  s.author = 'Julian Rubisch'
  s.email = 'julian@julianrubisch.at'
  s.homepage = 'https://julianrubisch.at'
  s.platform = Gem::Platform::RUBY
  s.summary = 'Uses Playwright to scrape bandcamp labels'

  # Package exactly the files tracked by git, one path per output line.
  s.files = `git ls-files`.split("\n")

  # Assign instead of appending: appending 'lib' to the existing list yields
  # a duplicated 'lib' entry in the built gem's metadata.
  s.require_paths = ['lib']

  s.extra_rdoc_files = ['README.rdoc', 'bandcamp-discover.rdoc']
  s.rdoc_options << '--title' << 'bandcamp-discover' << '--main' << 'README.rdoc' << '-ri'
  s.bindir = 'bin'
  s.executables << 'bandcamp-discover'

  s.add_development_dependency('rake')
  s.add_development_dependency('rdoc')
  s.add_development_dependency('minitest')
  s.add_development_dependency('standard')

  # Runtime dependencies.
  s.add_dependency('gli', '~> 2.21.5')
  s.add_dependency('playwright-ruby-client')
  s.add_dependency('sqlite3')
  s.add_dependency('async')
  s.add_dependency('concurrent-ruby')
  s.add_dependency('base64')
  s.add_dependency('logger')
  s.add_dependency('ostruct')
end
@@ -0,0 +1,5 @@
1
+ = bandcamp-discover
2
+
3
+ Generate this with
4
+ bandcamp-discover _doc
5
+ After you have described your command line interface
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env ruby
2
+ require 'gli'
3
+ require 'bandcamp-discover'
4
+
5
# Command line interface for bandcamp-discover, declared with the GLI DSL.
#
# Usage sketch:
#   bandcamp-discover scrape [--[no-]headless] [--max-tasks=N] discover [genre]
#   bandcamp-discover scrape [--[no-]headless] [--max-tasks=N] label url
class App
  extend GLI::App

  program_desc 'Scrape bandcamp discover pages and labels with Playwright'

  version BandcampDiscover::VERSION

  subcommand_option_handling :normal
  arguments :strict

  # Starts Playwright through `npx playwright`, launches Chromium and yields
  # the launched browser to the given block.
  #
  # @param headless [Boolean] forwarded to the Chromium launcher
  # @yieldparam browser the launched Playwright browser
  def self.with_browser(headless:)
    Playwright.create(playwright_cli_executable_path: 'npx playwright') do |playwright|
      playwright.chromium.launch(headless: headless) do |browser|
        yield browser
      end
    end
  end

  desc 'Scrape bandcamp discover, labels, or albums'
  command :scrape do |c|
    c.desc 'Use a headless browser'
    c.switch :headless, default_value: true

    c.desc 'Maximum tasks for scraping a label'
    c.flag :"max-tasks", default_value: 2

    # The genre is optional: with `arguments :strict` a plain `arg` would be
    # mandatory and the "experimental" fallback below could never apply.
    c.arg "genre", :optional
    c.command :discover do |discover|
      discover.action do |_global_options, options, args|
        genre = args.shift || "experimental"

        # With :normal subcommand option handling the switches declared on
        # `scrape` are stored under GLI::Command::PARENT, not at the top level
        # of this subcommand's options.
        scrape_options = options[GLI::Command::PARENT]

        App.with_browser(headless: scrape_options[:headless]) do |browser|
          puts BandcampDiscover::Scrapers::Discover.new(genre:, browser:).scrape
        end
      end
    end

    # Arguments are required unless flagged :optional.
    c.arg "url"
    c.command :label do |label|
      label.action do |_global_options, options, args|
        url = args.shift
        scrape_options = options[GLI::Command::PARENT]
        max_tasks = scrape_options["max-tasks"].to_i

        App.with_browser(headless: scrape_options[:headless]) do |browser|
          puts BandcampDiscover::Scrapers::Label.new(url:, browser:, max_tasks:).scrape
        end
      end
    end
  end

  pre do |_global, _command, _options, _args|
    # Pre logic here
    # Return true to proceed; false to abort and not call the
    # chosen command
    # Use skips_pre before a command to skip this block
    # on that command only
    true
  end

  post do |_global, _command, _options, _args|
    # Post logic here
    # Use skips_post before a command to skip this
    # block on that command only
  end

  on_error do |_exception|
    # Error logic here
    # return false to skip default error handling
    true
  end
end
79
+
80
+ exit App.run(ARGV)
@@ -0,0 +1,18 @@
1
+ require_relative "./base"
2
+ require_relative "./music"
3
+
4
module BandcampDiscover
  module Scrapers
    # Scraper for a single album page.
    class Album < Base
      # Navigates to the album URL and collects the text of every `a.tag`
      # element on the page.
      #
      # @return whatever Base#scrape returns for the block; on the normal path
      #   that is the array of tag texts
      def scrape
        super do |page|
          page.goto(@url)

          # the tags listed at the bottom of the page, reduced to their text
          page.query_selector_all("a.tag").map(&:inner_text)
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'playwright'
2
+
3
module BandcampDiscover
  module Scrapers
    # Common behaviour of all scrapers: remembers the target URL, the browser
    # and the concurrency limit, and runs a scraping block against a page.
    class Base
      # @param url [String] address the scraper works on
      # @param browser browser object; it must respond to `new_page`
      # @param max_tasks [Integer] concurrency limit available to subclasses
      def initialize(url:, browser:, max_tasks: 2)
        @url = url
        @browser = browser
        @max_tasks = max_tasks
      end

      # Opens a fresh page, yields it to the block and closes the page again
      # once the block is done, whether it finished normally, returned early
      # or raised. Without the close every scraper instance would leave a page
      # open for the lifetime of the browser.
      #
      # @yieldparam page the page opened for this call
      # @return the block's value; nil when no block is given or when a
      #   Playwright::TimeoutError was raised inside the block
      def scrape
        return unless block_given?

        page = @browser.new_page
        yield page
      rescue Playwright::TimeoutError
        puts "Failed to wait for element"
      ensure
        # page is still nil when no block was given or new_page itself failed
        page&.close
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require_relative "base"
2
+ require_relative "label"
3
+ require "uri"
4
+ require "async"
5
+ require "async/semaphore"
6
+ require "async/barrier"
7
+ require "concurrent"
8
+
9
module BandcampDiscover
  module Scrapers
    # Scraper for the bandcamp discover page of a genre: collects the hosts
    # the listed items link to and runs a Label scraper against each of them.
    class Discover < Base
      # @param genre [String] genre segment of the discover URL
      # @param browser browser object handed on to Base
      # @param max_tasks [Integer] maximum number of label scrapes running at
      #   the same time
      def initialize(genre:, browser:, max_tasks: Concurrent.processor_count)
        super(url: "https://bandcamp.com/discover/#{genre}?s=rand", browser: browser, max_tasks: max_tasks)
      end

      # Loads the discover page, derives one base URL per linked host and
      # scrapes those concurrently with Label.
      #
      # @return [Array] the non-nil results of the Label scrapers
      def scrape
        super do |page|
          page.goto(@url)

          records = page.wait_for_selector("ul.items").query_selector_all("li")

          # Items without an anchor are skipped instead of raising on nil.
          links = records.filter_map do |record|
            anchor = record.query_selector("a")
            anchor[:href] if anchor
          end

          # TODO: click "View more results"

          barrier = Async::Barrier.new

          Sync do
            semaphore = Async::Semaphore.new(@max_tasks, parent: barrier)

            tasks = label_urls(links).map do |url|
              semaphore.async do
                Scrapers::Label.new(url: url, browser: @browser).scrape
              end
            end

            tasks.map(&:wait).compact
          ensure
            barrier.stop
          end
        end
      end

      private

      # Reduces item links to "scheme://host" base URLs. Several items can
      # point at the same host, so duplicates are dropped to avoid scraping
      # (and reporting) the same label more than once.
      def label_urls(links)
        links.map do |link|
          uri = ::URI.parse(link)
          "#{uri.scheme}://#{uri.host}"
        end.uniq
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require_relative "./base"
2
+ require_relative "./music"
3
+
4
module BandcampDiscover
  module Scrapers
    # Scraper for a label page: decides from the bio text whether the page is
    # a label and, if so, scrapes the tags of its music page.
    class Label < Base
      # @return [Array, nil] `[url, music_tags]` when the bio text matches
      #   /label|platform/, otherwise nil
      def scrape
        super do |page|
          puts "starting to scrape #{@url}"
          page.goto(@url)

          bio = page.wait_for_selector("#bio-container").query_selector("#bio-text")

          # Pages whose bio does not mention "label" or "platform" are skipped.
          unless bio&.inner_html =~ /label|platform/
            puts "not a label: #{@url}"
            next
          end

          Sync do
            music_scraper = Scrapers::Music.new(url: "#{@url}/music", browser: @browser, max_tasks: @max_tasks)
            music_tags = music_scraper.scrape

            puts "done scraping #{@url}"

            [@url, music_tags]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
require_relative "base"
require_relative "album"
require "async"
require "async/semaphore"
require "uri"
5
+
6
module BandcampDiscover
  module Scrapers
    # Scraper for a music grid page: scrapes the tags of the first albums in
    # the grid and reports how the tags are distributed.
    class Music < Base
      # Only this many albums from the top of the grid are scraped.
      MAX_ALBUMS = 20

      # @param url [String] address of the music page
      # @param browser browser object handed on to Base and to Album scrapers
      # @param max_tasks [Integer] maximum number of album scrapes running at
      #   the same time
      def initialize(url:, browser:, max_tasks:)
        super

        uri = URI.parse(url)
        @base_url = "#{uri.scheme}://#{uri.host}"
      end

      # Scrapes the albums linked from `#music-grid` concurrently and tallies
      # their tags. Uses Async::Semaphore#async, so it is expected to run
      # inside an Async/Sync block.
      #
      # @return [Hash] tag => share of all collected tags, largest share first
      def scrape
        super do |page|
          page.goto(@url)
          album_list = page.wait_for_selector("#music-grid")
          album_links = album_list.query_selector_all("li.music-grid-item > a")

          # Resolve each href against the page's base URL: relative paths are
          # prefixed with it, hrefs that already are absolute URLs are kept as
          # they are instead of being glued onto the base URL.
          album_urls = album_links.take(MAX_ALBUMS).filter_map do |album_link|
            href = album_link[:href]
            URI.join(@base_url, href).to_s if href
          end

          semaphore = Async::Semaphore.new(@max_tasks)

          album_tags = album_urls.map do |album_url|
            semaphore.async do
              puts "starting to scrape #{album_url}"

              tags = Scrapers::Album.new(url: album_url, browser: @browser).scrape

              puts "done scraping #{album_url}"

              tags
            end
          end.map(&:wait)

          # An album scrape that timed out yields nil; drop those so that nil
          # is not counted as a tag.
          normalize_tally(album_tags.flatten.compact.tally)
        end
      end

      # Converts absolute counts into shares of the total and orders the
      # entries by descending share. The given hash is left untouched.
      #
      # @param tally [Hash] tag => count
      # @return [Hash] tag => count divided by the sum of all counts
      def normalize_tally(tally)
        total = tally.values.sum.to_f

        tally
          .transform_values { |count| count / total }
          .sort_by { |_tag, share| share }
          .reverse
          .to_h
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module BandcampDiscover
  # Version of the gem; frozen so the constant cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,7 @@
1
+ require 'async'
2
+ require 'bandcamp-discover/version.rb'
3
+ require 'bandcamp-discover/scrapers/discover.rb'
4
+
5
+ # Add requires for other files you add to your project here, so
6
+ # you just need to require this one file in your bin file
7
+
@@ -0,0 +1,14 @@
1
+ require_relative "test_helper"
2
+
3
# Placeholder test case that only checks that the test setup runs.
class DefaultTest < Minitest::Test
  # No setup/teardown overrides: the inherited ones do nothing, just like the
  # empty definitions they replace.

  def test_the_truth
    assert(true)
  end
end
@@ -0,0 +1,4 @@
1
+ require "minitest/autorun"
2
+
3
+ # Add test libraries you want to use here, e.g. mocha
4
+ # Add helper classes or methods here, too
metadata ADDED
@@ -0,0 +1,230 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bandcamp-discover
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Julian RUbisch
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 2025-02-19 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rake
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ type: :development
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rdoc
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: minitest
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: standard
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: gli
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: 2.21.5
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: 2.21.5
82
+ - !ruby/object:Gem::Dependency
83
+ name: playwright-ruby-client
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ - !ruby/object:Gem::Dependency
97
+ name: sqlite3
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: async
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ type: :runtime
118
+ prerelease: false
119
+ version_requirements: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ - !ruby/object:Gem::Dependency
125
+ name: concurrent-ruby
126
+ requirement: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ type: :runtime
132
+ prerelease: false
133
+ version_requirements: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ - !ruby/object:Gem::Dependency
139
+ name: base64
140
+ requirement: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ type: :runtime
146
+ prerelease: false
147
+ version_requirements: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ - !ruby/object:Gem::Dependency
153
+ name: logger
154
+ requirement: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - ">="
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ type: :runtime
160
+ prerelease: false
161
+ version_requirements: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - ">="
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
166
+ - !ruby/object:Gem::Dependency
167
+ name: ostruct
168
+ requirement: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ type: :runtime
174
+ prerelease: false
175
+ version_requirements: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ email: julian@julianrubisch.at
181
+ executables:
182
+ - bandcamp-discover
183
+ extensions: []
184
+ extra_rdoc_files:
185
+ - README.rdoc
186
+ - bandcamp-discover.rdoc
187
+ files:
188
+ - ".gitignore"
189
+ - Gemfile
190
+ - README.rdoc
191
+ - Rakefile
192
+ - bandcamp-discover.gemspec
193
+ - bandcamp-discover.rdoc
194
+ - bin/bandcamp-discover
195
+ - lib/bandcamp-discover.rb
196
+ - lib/bandcamp-discover/scrapers/album.rb
197
+ - lib/bandcamp-discover/scrapers/base.rb
198
+ - lib/bandcamp-discover/scrapers/discover.rb
199
+ - lib/bandcamp-discover/scrapers/label.rb
200
+ - lib/bandcamp-discover/scrapers/music.rb
201
+ - lib/bandcamp-discover/version.rb
202
+ - test/default_test.rb
203
+ - test/test_helper.rb
204
+ homepage: https://julianrubisch.at
205
+ licenses: []
206
+ metadata: {}
207
+ rdoc_options:
208
+ - "--title"
209
+ - bandcamp-discover
210
+ - "--main"
211
+ - README.rdoc
212
+ - "-ri"
213
+ require_paths:
214
+ - lib
215
+ - lib
216
+ required_ruby_version: !ruby/object:Gem::Requirement
217
+ requirements:
218
+ - - ">="
219
+ - !ruby/object:Gem::Version
220
+ version: '0'
221
+ required_rubygems_version: !ruby/object:Gem::Requirement
222
+ requirements:
223
+ - - ">="
224
+ - !ruby/object:Gem::Version
225
+ version: '0'
226
+ requirements: []
227
+ rubygems_version: 3.6.2
228
+ specification_version: 4
229
+ summary: Uses Playwright to scrape bandcamp labels
230
+ test_files: []