ligamagic-scraper 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
+ module LigaMagicScraper
+   module Loggable
+     attr_reader :logs
+
+     def initialize_logs
+       @logs = []
+     end
+
+     def log(message, level: :info)
+       @logs << {
+         timestamp: Time.now,
+         level: level,
+         message: message,
+         source: self.class.name
+       }
+     end
+
+     def log_info(message)
+       log(message, level: :info)
+     end
+
+     def log_debug(message)
+       log(message, level: :debug)
+     end
+
+     def log_warning(message)
+       log(message, level: :warning)
+     end
+
+     def log_error(message)
+       log(message, level: :error)
+     end
+
+     def formatted_logs
+       @logs.map { |entry| entry[:message] }
+     end
+
+     def clear_logs
+       @logs = []
+     end
+   end
+ end
+
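For context, a minimal usage sketch of the Loggable mixin above. The InventoryCheck class is hypothetical, not part of the package: a host class includes the module, calls initialize_logs before the first log call, then uses the level helpers.

    class InventoryCheck
      include LigaMagicScraper::Loggable

      def initialize
        initialize_logs # @logs must exist before the first #log call
      end

      def run
        log_info("starting check")
        log_warning("stock low")
        formatted_logs # => ["starting check", "stock low"]
      end
    end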
@@ -0,0 +1,126 @@
+ module LigaMagicScraper
+   class BaseScraper
+     include Capybara::DSL
+     include Loggable
+
+     attr_reader :browser_mode, :alert_system
+
+     def initialize(browser_mode: 'headed', alert_config: nil)
+       @browser_mode = browser_mode
+       @alert_system = AlertSystem.new(alert_config || {}) if alert_config
+       initialize_logs
+       configure_browser
+     end
+
+     def configure_browser
+       case browser_mode.downcase
+       when 'headless'
+         Capybara.register_driver :selenium_chrome_headless do |app|
+           options = Selenium::WebDriver::Chrome::Options.new
+           options.add_argument('--headless')
+           options.add_argument('--disable-gpu')
+           options.add_argument('--no-sandbox')
+           options.add_argument('--disable-dev-shm-usage')
+
+           Capybara::Selenium::Driver.new(app, browser: :chrome, options: options)
+         end
+         Capybara.default_driver = :selenium_chrome_headless
+       when 'headed'
+         Capybara.default_driver = :selenium_chrome
+       else
+         log_warning("⚠️ Unknown browser mode '#{browser_mode}', using default (headed)")
+         Capybara.default_driver = :selenium_chrome
+       end
+     end
+
+     def generate_slug(name)
+       return nil if name.nil? || name.empty?
+
+       # Transliterate accented characters to ASCII
+       slug = name.downcase
+                  .tr('áàãâäåāăąǎǟǡǻȁȃȧ', 'a')
+                  .tr('éèêëēĕėęěȅȇȩ', 'e')
+                  .tr('íìîïĩīĭįıȉȋ', 'i')
+                  .tr('óòôõöōŏőơǒǿȍȏȫȭȯȱ', 'o')
+                  .tr('úùûüũūŭůűųưȕȗ', 'u')
+                  .tr('çćĉċč', 'c')
+                  .tr('ñńņňʼn', 'n')
+                  .gsub(/[^a-z0-9]+/, '_')
+                  .gsub(/^_+|_+$/, '')
+
+       slug
+     end
+
+     # Convert a Brazilian-format price string (e.g. "R$ 1.234,56") into a
+     # Float (1234.56): strip the currency symbol, drop the thousands
+     # separator, then swap the decimal comma for a dot.
+     def parse_price(price_text)
+       return nil if price_text.nil? || price_text.empty?
+
+       price_text.gsub(/R\$\s*/, '')
+                 .gsub(/\./, '')
+                 .gsub(/,/, '.')
+                 .to_f
+     rescue
+       nil
+     end
+
+     def save_to_json(products)
+       filename = generate_filename
+
+       dir = File.dirname(filename)
+       FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
+
+       data = build_json_data(products)
+
+       if @alert_system
+         previous_file = find_previous_scrape(filename)
+         @alert_system.process(current_data: data, previous_file:)
+       end
+
+       File.write(filename, JSON.pretty_generate(data))
+       log_info("💾 Results saved to: #{filename}")
+       filename
+     end
+
+     def find_previous_scrape(current_filename)
+       dir = File.dirname(current_filename)
+
+       # Extract the slug pattern from current filename (e.g., "__booster_box.json")
+       basename = File.basename(current_filename)
+       slug_pattern = basename.match(/__(.+)\.json$/)
+
+       return nil unless slug_pattern
+
+       slug = slug_pattern[1]
+
+       # Filenames start with a timestamp, so a descending name sort puts the
+       # newest scrape for this slug first
+       matching_files = Dir.glob(File.join(dir, "*__#{slug}.json")).sort.reverse
+
+       # Return the first file that's not the current one (most recent previous)
+       matching_files.find { |f| f != current_filename }
+     end
+
+     # Override in subclasses
+     def generate_filename
+       raise NotImplementedError, "Subclasses must implement generate_filename"
+     end
+
+     # Override in subclasses
+     def build_json_data(products)
+       raise NotImplementedError, "Subclasses must implement build_json_data"
+     end
+
+     # Override in subclasses
+     def scrape
+       raise NotImplementedError, "Subclasses must implement scrape"
+     end
+
+     def close_browser
+       log_info("🔒 Closing browser...")
+       # Record the close so callers' ensure blocks don't quit a dead session
+       @browser_closed = true
+       Capybara.current_session.driver.quit
+     rescue => e
+       log_error("⚠️ Error closing browser: #{e.message}")
+     end
+   end
+ end
+
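BaseScraper is an abstract template: subclasses supply generate_filename, build_json_data, and scrape, while the base class handles browser setup, slug and price parsing, and JSON persistence. A minimal sketch of a conforming subclass follows; DummyScraper and its output path are hypothetical, and this assumes capybara, selenium-webdriver, fileutils, and json are required elsewhere in the gem.

    class DummyScraper < LigaMagicScraper::BaseScraper
      def generate_filename
        "scrapped/dummy/#{Time.now.strftime('%Y%m%d_%H%M%S')}__example.json"
      end

      def build_json_data(products)
        { scraped_at: Time.now.iso8601, total_products: products.count, products: }
      end

      def scrape
        visit 'https://www.ligamagic.com.br'
        [] # a real subclass would extract products here
      end
    end

The shared helpers behave as the code above suggests: generate_slug('Relâmpago Ancestral') returns "relampago_ancestral", and parse_price('R$ 1.234,56') returns 1234.56.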
@@ -0,0 +1,240 @@
+ require_relative 'base_scraper'
+ require 'nokogiri'
+ require 'benchmark'
+ require 'set'
+ require 'cgi'  # CGI.escape is used in #url
+ require 'time' # Time#iso8601 is used in #build_json_data
+
+ module LigaMagicScraper
+   class GlobalScraper < BaseScraper
+     BASE_URL = 'https://www.ligamagic.com.br/?view=cards%2Fsearch&tipo=1'
+     MAX_CLICKS = 50
+
+     attr_reader :search_term
+
+     def initialize(search_term:, browser_mode: 'headed', alert_config: nil)
+       @search_term = search_term
+       @product_html_snapshots = []
+       @timings = {}
+       super(browser_mode:, alert_config:)
+
+       log_info("🚀 Starting Liga Magic global search scraper...")
+       log_info("🔍 Search term: #{search_term}")
+       log_info("🖥️ Browser mode: #{browser_mode}")
+     end
+
+     def url
+       "#{BASE_URL}&card=#{CGI.escape(search_term)}"
+     end
+
+     def scrape
+       start_time = Time.now
+
+       visit url
+       log_info("📄 Loaded initial page")
+
+       # Phase 1: Load all pages (browser open, no capturing)
+       load_time = Benchmark.measure do
+         load_all_products
+       end
+       @timings[:loading] = load_time.real
+       log_info("⏱️ Phase 1 (Loading): #{format('%.2f', load_time.real)}s")
+
+       # Phase 2: Capture all products at once (browser still open)
+       capture_time = Benchmark.measure do
+         capture_all_products
+       end
+       @timings[:capture] = capture_time.real
+       log_info("⏱️ Phase 2 (Capture): #{format('%.2f', capture_time.real)}s")
+
+       # Close browser ASAP
+       close_time = Benchmark.measure do
+         close_browser
+       end
+       @timings[:browser_close] = close_time.real
+       log_info("✅ Browser closed in #{format('%.2f', close_time.real)}s")
+
+       # Phase 3: Extract products from memory (browser closed)
+       log_info("🔍 Extracting products from memory...")
+       products = nil
+       extraction_time = Benchmark.measure do
+         products = extract_products
+       end
+       @timings[:extraction] = extraction_time.real
+       log_info("⏱️ Phase 3 (Extraction): #{format('%.2f', extraction_time.real)}s")
+
+       total_time = Time.now - start_time
+       @timings[:total] = total_time
+
+       log_info("=" * 60)
+       log_info("📊 PERFORMANCE SUMMARY")
+       log_info("=" * 60)
+       log_info("⏱️ Loading (browser open): #{format('%.2f', @timings[:loading])}s")
+       log_info("⏱️ Capture (browser open): #{format('%.2f', @timings[:capture])}s")
+       log_info("⏱️ Browser Close: #{format('%.2f', @timings[:browser_close])}s")
+       log_info("⏱️ Extraction (browser closed): #{format('%.2f', @timings[:extraction])}s")
+       log_info("⏱️ Total Time: #{format('%.2f', @timings[:total])}s")
+       log_info("📦 Products captured: #{@product_html_snapshots.count}")
+       log_info("✅ Valid products extracted: #{products.count}")
+       log_info("=" * 60)
+
+       products
+     rescue => e
+       log_error("❌ Error during scraping: #{e.message}")
+       log_debug(e.backtrace.first(5).join("\n"))
+       []
+     ensure
+       # If an error interrupted the happy path, the browser is still open
+       close_browser unless @browser_closed
+     end
+
+     def generate_filename
+       datetime_str = Time.now.strftime('%Y%m%d_%H%M%S')
+       slug = search_term.downcase.gsub(/[^a-z0-9]+/, '_').gsub(/^_|_$/, '')
+       "scrapped/global/#{datetime_str}__#{slug}.json"
+     end
+
+     def build_json_data(products)
+       {
+         search_term:,
+         search_type: 'global',
+         scraped_at: Time.now.iso8601,
+         total_products: products.count,
+         products:
+       }
+     end
+
+     private
+
+     def load_all_products
+       log_info("🔄 Starting pagination...")
+       click_count = 0
+
+       loop do
+         if has_unavailable_products?
+           log_warning("⚠️ Found unavailable products, stopping pagination")
+           break
+         end
+
+         load_more_button = find_load_more_button
+
+         if load_more_button.nil?
+           log_info("✅ No more 'Load More' button found")
+           break
+         end
+
+         if click_count >= MAX_CLICKS
+           log_warning("⚠️ Reached maximum click limit (#{MAX_CLICKS})")
+           break
+         end
+
+         begin
+           load_more_button.click
+           click_count += 1
+           log_debug("🔄 Clicked 'Load More' button (#{click_count}x), waiting for new products...")
+           sleep 1
+         rescue => e
+           log_error("❌ Error clicking 'Load More': #{e.message}")
+           break
+         end
+       end
+
+       log_info("✅ Pagination complete. Total clicks: #{click_count}")
+     end
+
+     def has_unavailable_products?
+       page.has_text?(/indispon[ií]vel|esgotado/i, wait: 1)
+     end
+
+     def find_load_more_button
+       page.find('input.exibir-mais[value="Exibir mais"]', visible: true, wait: 1)
+     rescue Capybara::ElementNotFound
+       nil
+     end
+
+     def capture_all_products
+       log_info("📸 Capturing all products...")
+       # Find all product elements in one go and capture their outer HTML
+       product_elements = page.all('div.box.p25 div.mtg-single', wait: 5)
+       product_elements.each do |element|
+         @product_html_snapshots << element[:outerHTML]
+       end
+       log_info("📸 Captured #{@product_html_snapshots.count} product HTML snapshots")
+     end
+
+     def extract_products
+       log_info("🔍 Extracting product data from #{@product_html_snapshots.count} HTML snapshots...")
+       products = []
+       seen_ids = Set.new
+
+       @product_html_snapshots.each_with_index do |html, index|
+         log_debug(".") if (index + 1) % 10 == 0
+
+         begin
+           doc = Nokogiri::HTML(html)
+
+           # Check availability first
+           text_content = doc.text.downcase
+           next if text_content.match?(/indispon[ií]vel|esgotado|sem estoque/)
+
+           name_data = extract_product_name_and_id_from_doc(doc)
+           next unless name_data
+
+           # Skip duplicates (website shows same products on multiple pages)
+           next if seen_ids.include?(name_data[:id])
+           seen_ids.add(name_data[:id])
+
+           prices = extract_product_prices_from_doc(doc)
+           next unless prices
+
+           slug = generate_slug(name_data[:name])
+
+           products << {
+             id: name_data[:id],
+             slug:,
+             name: name_data[:name],
+             min_price: prices[:min_price],
+             avg_price: prices[:avg_price],
+             max_price: prices[:max_price]
+           }
+         rescue => e
+           log_warning("⚠️ Error extracting product #{index + 1}: #{e.message}")
+         end
+       end
+
+       log_info("✅ Extraction complete. Valid products: #{products.count} (from #{@product_html_snapshots.count} snapshots)")
+       products
+     end
+
+     def extract_product_name_and_id_from_doc(doc)
+       link = doc.at_css('.mtg-names .mtg-name-prod a')
+       return nil unless link
+
+       href = link['href']
+       match = href&.match(/pcode=(\d+)/)
+
+       name = link.text.strip
+       id = match ? match[1] : nil
+
+       return nil if name.empty?
+
+       { name:, id: }
+     end
+
+     def extract_product_prices_from_doc(doc)
+       price_unit = doc.at_css('.mtg-prices .mtg-price-unit')
+       return nil unless price_unit
+
+       min_price_el = price_unit.at_css('.price-min')
+       avg_price_el = price_unit.at_css('.price-avg')
+       max_price_el = price_unit.at_css('.price-max')
+
+       {
+         min_price: parse_price(min_price_el&.text&.strip),
+         avg_price: parse_price(avg_price_el&.text&.strip),
+         max_price: parse_price(max_price_el&.text&.strip)
+       }
+     end
+   end
+ end
+
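Putting it together, a sketch of how this version's public surface appears intended to be driven. The search term and output path are illustrative, not from the package.

    scraper = LigaMagicScraper::GlobalScraper.new(
      search_term: 'booster box',
      browser_mode: 'headless'
    )
    products = scraper.scrape              # paginate, capture, close browser, extract
    path = scraper.save_to_json(products)  # e.g. scrapped/global/20240101_120000__booster_box.json
    puts scraper.formatted_logs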