ligamagic-scraper 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lib/ligamagic_scraper/scrapers/store_scraper.rb ADDED
@@ -0,0 +1,392 @@
+ require_relative 'base_scraper'
+ require 'nokogiri'
+ require 'benchmark'
+ require 'set'
+
+ module LigaMagicScraper
+   class StoreScraper < BaseScraper
+     attr_reader :store_domain, :store_url, :search_term, :max_pages
+
+     def initialize(store_domain:, search_term: nil, max_pages: nil, browser_mode: 'headed', alert_config: nil)
+       @store_domain = store_domain
+       @search_term = search_term
+       @max_pages = max_pages
+       @store_url = build_store_url(store_domain, search_term)
+       @product_html_snapshots = []
+       @timings = {}
+
+       # Validate: max_pages is required when no search term is given
+       if search_term.nil? && max_pages.nil?
+         raise ArgumentError, "max_pages is required when listing store products without a search term"
+       end
+
+       super(browser_mode:, alert_config:)
+
+       log_info("🚀 Starting Liga Magic Store scraper...")
+       log_info("🏪 Store domain: #{store_domain}")
+       log_info("🔍 Search term: #{search_term}") if search_term
+       log_info("📄 Max pages: #{max_pages}") if max_pages
+       log_info("🔗 Store URL: #{@store_url}")
+       log_info("🖥️ Browser mode: #{browser_mode}")
+     end
+
+     def scrape
+       start_time = Time.now
+       log_info("📄 Starting pagination...")
+
+       # Phase 1: Load all pages and capture HTML (browser open)
+       load_time = Benchmark.measure do
+         load_all_pages
+       end
+       @timings[:loading_and_capture] = load_time.real
+       log_info("⏱️ Phase 1 (Loading + Capture): #{format('%.2f', load_time.real)}s")
+
+       # Close browser ASAP
+       close_time = Benchmark.measure do
+         close_browser
+       end
+       @timings[:browser_close] = close_time.real
+       log_info("✅ Browser closed in #{format('%.2f', close_time.real)}s")
+
+       # Phase 2: Extract products from memory (browser closed)
+       log_info("🔍 Extracting products from memory...")
+       all_products = nil
+       extraction_time = Benchmark.measure do
+         all_products = extract_products_from_html
+       end
+       @timings[:extraction] = extraction_time.real
+       log_info("⏱️ Phase 2 (Extraction): #{format('%.2f', extraction_time.real)}s")
+
+       total_time = Time.now - start_time
+       @timings[:total] = total_time
+
+       log_info("=" * 60)
+       log_info("📊 PERFORMANCE SUMMARY")
+       log_info("=" * 60)
+       log_info("⏱️ Loading + Capture (browser open): #{format('%.2f', @timings[:loading_and_capture])}s")
+       log_info("⏱️ Browser Close: #{format('%.2f', @timings[:browser_close])}s")
+       log_info("⏱️ Extraction (browser closed): #{format('%.2f', @timings[:extraction])}s")
+       log_info("⏱️ Total Time: #{format('%.2f', @timings[:total])}s")
+       log_info("📦 Products captured: #{@product_html_snapshots.count}")
+       log_info("✅ Valid products extracted: #{all_products.count}")
+       log_info("=" * 60)
+
+       all_products
+     rescue => e
+       log_error("❌ Error during scraping: #{e.message}")
+       log_debug(e.backtrace.first(5).join("\n"))
+       []
+     ensure
+       close_browser if @driver
+     end
+
+     def generate_filename
+       datetime_str = Time.now.strftime('%Y%m%d_%H%M%S')
+       store_slug = generate_slug(@store_domain)
+
+       if @search_term && !@search_term.empty?
+         search_slug = generate_slug(@search_term)
+         "scrapped/stores/#{store_slug}/#{datetime_str}__#{search_slug}.json"
+       else
+         "scrapped/stores/#{store_slug}/#{datetime_str}.json"
+       end
+     end
+
+     def build_json_data(products)
+       data = {
+         store_domain:,
+         store_url:,
+         search_type: 'store',
+         scraped_at: Time.now.iso8601,
+         total_products: products.count,
+         products:
+       }
+
+       data[:search_term] = search_term if search_term && !search_term.empty?
+       data[:max_pages] = max_pages if max_pages
+       data
+     end
+
+     private
+
+     # ============================================================================
+     # OBFUSCATED PRICE/QUANTITY EXTRACTION (Currently Not Implemented)
+     # ============================================================================
+     #
+     # When using store search with a search term (-u STORE -s TERM), Liga Magic
+     # employs sophisticated anti-scraping protection for prices and quantities:
+     #
+     # 1. CSS CLASS OBFUSCATION:
+     #    - Digit values are encoded using randomized CSS class names
+     #    - Example: <div class="qYlMh mImKn lJcCw">&nbsp;</div>
+     #    - Classes change with each page load/session
+     #
+     # 2. SPRITE-BASED RENDERING:
+     #    - Digits are rendered using CSS background-position from a sprite image
+     #    - Example CSS: .lJcCw{background-position:-488px -2px;}
+     #    - The sprite image URL also rotates per session
+     #
+     # 3. DYNAMIC MAPPING:
+     #    - Both class names AND background positions change between sessions
+     #    - Example: Session 1: .bPzEo{-216px -2px} → '0'
+     #               Session 2: .nLwKv{-392px -65px} → '0'
+     #    - No text content in DOM (innerText/textContent return empty strings)
+     #
+     # ATTEMPTED SOLUTIONS:
+     # - Static CSS mapping: Failed (classes/positions rotate)
+     # - JavaScript DOM extraction: Failed (no text content, purely visual)
+     # - CSS parsing: Partial (can extract mapping structure but positions change)
+     #
+     # POTENTIAL SOLUTIONS (Not Implemented):
+     # - Download sprite image and use OCR/image analysis (requires external gems)
+     # - Use image template matching at background-position coordinates
+     # - Analyze sprite pixel data to identify digits
+     #
+     # CURRENT BEHAVIOR:
+     # - Store listings (no search term): Price/qty extracted normally ✓
+     # - Store searches (with search term): Price/qty set to nil (cards extracted without pricing)
+     #
+     # ============================================================================
+
+     def extract_obfuscated_price(product)
+       # Placeholder for future implementation
+       # Would require sprite image download and analysis
+       nil
+     end
+
+     def extract_obfuscated_quantity(product)
+       # Placeholder for future implementation
+       # Would require sprite image download and analysis
+       nil
+     end
+
+
+     def build_store_url(domain, search_term = nil)
+       # Build URL: https://www.<domain>.com.br/?view=ecom/itens&tcg=1
+       base_domain = domain.include?('.') ? domain : "#{domain}.com.br"
+       base_domain = "www.#{base_domain}" unless base_domain.start_with?('www.')
+
+       url = "https://#{base_domain}/?view=ecom/itens"
+
+       if search_term && !search_term.empty?
+         # Add search parameter
+         url += "&busca=#{CGI.escape(search_term)}"
+       else
+         # Keep tcg=1 for non-search listing
+         url += "&tcg=1"
+       end
+
+       # Add ordering by price (most expensive to cheapest)
+       url += "&txt_order=6"
+
+       # Add filter for only in-stock items
+       url += "&txt_estoque=1"
+
+       url
+     end
+
+     def build_page_url(page_number)
+       return @store_url if page_number == 1
+
+       # Add page parameter
+       separator = @store_url.include?('?') ? '&' : '?'
+       "#{@store_url}#{separator}page=#{page_number}"
+     end
+
+     def should_continue_pagination?(current_page)
+       # If max_pages is set (no search term), check limit
+       if @max_pages
+         if current_page >= @max_pages
+           log_info("📄 Reached max pages limit (#{@max_pages})")
+           return false
+         end
+       end
+
+       # Check if there's a next page button
+       has_next_page?
+     end
+
+     def has_next_page?
+       # Look for next page link or button with &gt; (>)
+       # Use wait: 0 to avoid long waits when pagination doesn't exist
+       pagination_links = page.all('a.ecomresp-paginacao', wait: 0)
+
+       if pagination_links.empty?
+         log_info("  ℹ️ No pagination found (single page)")
+         return false
+       end
+
+       next_button = pagination_links.find do |link|
+         link.text.strip == '>' || link.text.include?('&gt;')
+       end
+
+       if next_button
+         log_debug("  ✓ Next page button found")
+         true
+       else
+         log_info("  ℹ️ No more pages available")
+         false
+       end
+     rescue => e
+       log_debug("  ⚠️ Error checking for next page: #{e.message}")
+       false
+     end
+
+     def load_all_pages
+       log_info("🔄 Loading all pages and capturing HTML...")
+       current_page = 1
+
+       loop do
+         log_info("📄 Loading page #{current_page}...")
+         page_url = build_page_url(current_page)
+         visit page_url
+
+         # Capture products from this page
+         product_elements = page.all('.card-item', wait: 3)
+         if product_elements.empty?
+           log_warning("⚠️ No products found on page #{current_page}")
+           break
+         end
+
+         # Capture HTML from this page
+         product_elements.each do |element|
+           @product_html_snapshots << element[:outerHTML]
+         end
+         log_info("  Captured #{product_elements.count} products from page #{current_page} (total: #{@product_html_snapshots.count})")
+
+         # Check if we should continue to next page
+         if should_continue_pagination?(current_page)
+           current_page += 1
+           sleep 1 # Be nice to the server
+         else
+           break
+         end
+       end
+
+       log_info("✅ Loading complete. Total pages: #{current_page}, Total products captured: #{@product_html_snapshots.count}")
+     end
+
+     def capture_all_products
+       # Capture now happens incrementally inside load_all_pages.
+       # Kept for compatibility, but not used in the optimized flow.
+       log_info("📸 Capturing remaining products (if any)...")
+       product_elements = page.all('.card-item', wait: 1)
+       if product_elements.any?
+         product_elements.each do |element|
+           @product_html_snapshots << element[:outerHTML]
+         end
+         log_info("📸 Captured #{product_elements.count} additional products")
+       end
+       log_info("📸 Total snapshots: #{@product_html_snapshots.count}")
+     end
+
+     def extract_products_from_html
+       log_info("🔍 Extracting product data from #{@product_html_snapshots.count} HTML snapshots...")
+       products = []
+       seen_ids = Set.new
+
+       # Warn if using search term (price/qty won't be extracted)
+       if @search_term && !@search_term.empty?
+         log_warning("⚠️ Search term detected - price/qty extraction disabled (CSS obfuscation)")
+       end
+
+       @product_html_snapshots.each_with_index do |html, index|
+         log_debug(".") if (index + 1) % 10 == 0
+
+         begin
+           doc = Nokogiri::HTML(html)
+           product_data = extract_product_data_from_doc(doc)
+
+           if product_data
+             # Skip duplicates
+             next if seen_ids.include?(product_data[:card_id])
+             seen_ids.add(product_data[:card_id]) if product_data[:card_id]
+
+             products << product_data
+           end
+         rescue => e
+           log_warning("⚠️ Error extracting product #{index + 1}: #{e.message}")
+         end
+       end
+
+       log_info("✅ Extraction complete. Valid products: #{products.count} (from #{@product_html_snapshots.count} snapshots)")
+       products
+     end
+
+     def extract_product_data_from_doc(doc)
+       name = extract_product_name_from_doc(doc)
+
+       if name.nil? || name.empty?
+         log_debug("  ⚠️ Skipping product: no name found")
+         return nil
+       end
+
+       card_id = extract_card_id_from_doc(doc)
+
+       # Price and quantity extraction depends on search mode
+       if @search_term && !@search_term.empty?
+         # When search term is provided, Liga Magic uses CSS obfuscation for prices/quantities
+         price = nil
+         qtd = nil
+         available = nil
+       else
+         # Store listings (without search) have normal HTML structure
+         price = extract_product_price_from_doc(doc)
+         qtd = extract_quantity_from_doc(doc)
+         available = qtd && qtd > 0 # Available if has quantity
+       end
+
+       slug = generate_slug(name)
+
+       {card_id:, name:, slug:, price:, qtd:, available:}
+     end
+
+     def extract_card_id_from_doc(doc)
+       link = doc.at_css('.card-desc .title a')
+       return nil unless link
+
+       href = link['href']
+       return nil unless href
+
+       match = href.match(/[?&]card=(\d+)/)
+       match ? match[1] : nil
+     rescue => e
+       log_debug("  ⚠️ Error extracting card ID: #{e.message}")
+       nil
+     end
+
+     def extract_product_name_from_doc(doc)
+       link = doc.at_css('.card-desc .title a')
+       return nil unless link
+
+       link.text.strip
+     rescue => e
+       log_debug("  ⚠️ Error extracting name: #{e.message}")
+       nil
+     end
+
+     def extract_product_price_from_doc(doc)
+       # For store listings (no search term), prices are in plain text
+       price_element = doc.at_css('.card-desc .price .align-price')
+       return nil unless price_element
+
+       price_text = price_element.text.strip
+       parse_price(price_text)
+     rescue => e
+       log_debug("  ⚠️ Error extracting price: #{e.message}")
+       nil
+     end
+
+     def extract_quantity_from_doc(doc)
+       # For store listings (no search term), quantities are in plain text spans
+       qty_element = doc.at_css('.card-desc .qty span')
+       return nil unless qty_element
+
+       qty_text = qty_element.text.strip
+       qty_text.to_i
+     rescue => e
+       log_debug("  ⚠️ Error extracting quantity: #{e.message}")
+       nil
+     end
+   end
+ end
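
Editor's note: the obfuscation comments in store_scraper.rb above describe CSS parsing as a partially working approach — the per-session class-to-position mapping can be read out of the page's stylesheet, but turning positions into digits still requires analyzing the sprite image, which this release does not do. As a purely illustrative sketch (not shipped in the gem), the mapping could be pulled from the raw CSS text roughly like this; the rule format mirrors the ".lJcCw{background-position:-488px -2px;}" example in the comments:

# Hypothetical helper, for illustration only — not part of ligamagic-scraper 0.6.0.
# Parses rules like ".lJcCw{background-position:-488px -2px;}" into
# { "lJcCw" => [-488, -2], ... } for the current session's stylesheet.
def css_position_map(css_text)
  css_text
    .scan(/\.(\w+)\s*\{\s*background-position:\s*(-?\d+)px\s+(-?\d+)px/)
    .to_h { |klass, x, y| [klass, [x.to_i, y.to_i]] }
end

Mapping each [x, y] pair to a digit would then require downloading the session's sprite image and inspecting the pixels at those offsets, as the "potential solutions" list in the comment block suggests.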
lib/ligamagic_scraper/version.rb ADDED
@@ -0,0 +1,4 @@
+ module LigaMagicScraper
+   VERSION = "0.6.0"
+ end
+
lib/ligamagic_scraper.rb ADDED
@@ -0,0 +1,18 @@
+ require 'capybara'
+ require 'capybara/dsl'
+ require 'selenium-webdriver'
+ require 'json'
+ require 'date'
+ require 'cgi'
+ require 'uri'
+ require 'fileutils'
+
+ require_relative 'ligamagic_scraper/version'
+ require_relative 'ligamagic_scraper/loggable'
+ require_relative 'ligamagic_scraper/scrapers/base_scraper'
+ require_relative 'ligamagic_scraper/scrapers/global_scraper'
+ require_relative 'ligamagic_scraper/scrapers/store_scraper'
+ require_relative 'ligamagic_scraper/alerts/alert_system'
+ require_relative 'ligamagic_scraper/cli'
+
+ Capybara.default_max_wait_time = 5
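
Editor's note: taken together, the files above suggest the following programmatic usage. This is a sketch based only on the StoreScraper signature shown in this diff (the gem also installs a ligamagic-scraper executable; its CLI is not shown here). The store domain and search term are placeholders, and JSON/FileUtils are already required by lib/ligamagic_scraper.rb:

require 'ligamagic_scraper'

# Placeholder store; build_store_url expands it to https://www.examplestore.com.br/...
scraper = LigaMagicScraper::StoreScraper.new(
  store_domain: 'examplestore',
  search_term: 'Black Lotus',   # optional; without it, max_pages: must be given
  browser_mode: 'headed'
)

products = scraper.scrape
path = scraper.generate_filename   # scrapped/stores/<store-slug>/<timestamp>__<search-slug>.json
FileUtils.mkdir_p(File.dirname(path))
File.write(path, JSON.pretty_generate(scraper.build_json_data(products)))

Note that with a search term, price/qtd/available come back as nil (per the obfuscation notes), while a plain store listing with max_pages: set extracts them normally.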
metadata ADDED
@@ -0,0 +1,134 @@
+ --- !ruby/object:Gem::Specification
+ name: ligamagic-scraper
+ version: !ruby/object:Gem::Version
+   version: 0.6.0
+ platform: ruby
+ authors:
+ - Vinicius Kammradt
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2025-11-24 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: capybara
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.40'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.40'
+ - !ruby/object:Gem::Dependency
+   name: selenium-webdriver
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.15'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.15'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.12'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.12'
+ description: A Ruby gem to scrape card prices and information from ligamagic.com.br
+ email:
+ - vinicius.kammradt1@gmail.com
+ executables:
+ - ligamagic-scraper
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - CHANGELOG.md
+ - Gemfile
+ - LICENSE
+ - README.md
+ - Rakefile
+ - bin/ligamagic-scraper
+ - lib/ligamagic_scraper.rb
+ - lib/ligamagic_scraper/alerts/alert_system.rb
+ - lib/ligamagic_scraper/alerts/base_alert.rb
+ - lib/ligamagic_scraper/alerts/file_alert.rb
+ - lib/ligamagic_scraper/alerts/telegram_alert.rb
+ - lib/ligamagic_scraper/cli.rb
+ - lib/ligamagic_scraper/loggable.rb
+ - lib/ligamagic_scraper/scrapers/base_scraper.rb
+ - lib/ligamagic_scraper/scrapers/global_scraper.rb
+ - lib/ligamagic_scraper/scrapers/store_scraper.rb
+ - lib/ligamagic_scraper/version.rb
+ homepage: https://github.com/kammradt/ligamagic-scrapper
+ licenses:
+ - MIT
+ metadata:
+   homepage_uri: https://github.com/kammradt/ligamagic-scrapper
+   source_code_uri: https://github.com/kammradt/ligamagic-scrapper
+   changelog_uri: https://github.com/kammradt/ligamagic-scrapper/blob/main/CHANGELOG.md
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 2.7.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.22
+ signing_key:
+ specification_version: 4
+ summary: A web scraper for Liga Magic product prices
+ test_files: []
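
Editor's note: the runtime dependencies declared in the gemspec above translate to a Gemfile entry like the following (illustrative only; Bundler resolves capybara ~> 3.40 and selenium-webdriver ~> 4.15 automatically):

source 'https://rubygems.org'

gem 'ligamagic-scraper', '0.6.0'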