ligamagic-scraper 0.6.0
This diff shows the content of publicly available package versions as released to one of the supported public registries; it is provided for informational purposes only.
- checksums.yaml +7 -0
- data/CHANGELOG.md +318 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +614 -0
- data/Rakefile +121 -0
- data/bin/ligamagic-scraper +28 -0
- data/lib/ligamagic_scraper/alerts/alert_system.rb +218 -0
- data/lib/ligamagic_scraper/alerts/base_alert.rb +75 -0
- data/lib/ligamagic_scraper/alerts/file_alert.rb +56 -0
- data/lib/ligamagic_scraper/alerts/telegram_alert.rb +36 -0
- data/lib/ligamagic_scraper/cli.rb +152 -0
- data/lib/ligamagic_scraper/loggable.rb +43 -0
- data/lib/ligamagic_scraper/scrapers/base_scraper.rb +126 -0
- data/lib/ligamagic_scraper/scrapers/global_scraper.rb +240 -0
- data/lib/ligamagic_scraper/scrapers/store_scraper.rb +392 -0
- data/lib/ligamagic_scraper/version.rb +4 -0
- data/lib/ligamagic_scraper.rb +18 -0
- metadata +134 -0
data/lib/ligamagic_scraper/loggable.rb
@@ -0,0 +1,43 @@
+module LigaMagicScraper
+  module Loggable
+    attr_reader :logs
+
+    def initialize_logs
+      @logs = []
+    end
+
+    def log(message, level: :info)
+      @logs << {
+        timestamp: Time.now,
+        level: level,
+        message: message,
+        source: self.class.name
+      }
+    end
+
+    def log_info(message)
+      log(message, level: :info)
+    end
+
+    def log_debug(message)
+      log(message, level: :debug)
+    end
+
+    def log_warning(message)
+      log(message, level: :warning)
+    end
+
+    def log_error(message)
+      log(message, level: :error)
+    end
+
+    def formatted_logs
+      @logs.map { |entry| entry[:message] }
+    end
+
+    def clear_logs
+      @logs = []
+    end
+  end
+end
+
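The Loggable mixin buffers structured log entries in memory rather than writing to stdout or a file: the host object calls initialize_logs once, records with the level helpers, and drains via formatted_logs or clear_logs. A minimal sketch of that contract (the ReportJob class is hypothetical, not part of the gem):

    # Hypothetical host class, for illustration only -- not part of the gem.
    class ReportJob
      include LigaMagicScraper::Loggable

      def initialize
        initialize_logs               # @logs starts as an empty array
      end

      def run
        log_info("starting")          # appends {timestamp:, level: :info, message:, source: "ReportJob"}
        log_warning("slow network")
        formatted_logs                # => ["starting", "slow network"]
      end
    end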
data/lib/ligamagic_scraper/scrapers/base_scraper.rb
@@ -0,0 +1,126 @@
+module LigaMagicScraper
+  class BaseScraper
+    include Capybara::DSL
+    include Loggable
+
+    attr_reader :browser_mode, :alert_system
+
+    def initialize(browser_mode: 'headed', alert_config: nil)
+      @browser_mode = browser_mode
+      @alert_system = AlertSystem.new(alert_config || {}) if alert_config
+      initialize_logs
+      configure_browser
+    end
+
+    def configure_browser
+      case browser_mode.downcase
+      when 'headless'
+        Capybara.register_driver :selenium_chrome_headless do |app|
+          options = Selenium::WebDriver::Chrome::Options.new
+          options.add_argument('--headless')
+          options.add_argument('--disable-gpu')
+          options.add_argument('--no-sandbox')
+          options.add_argument('--disable-dev-shm-usage')
+
+          Capybara::Selenium::Driver.new(app, browser: :chrome, options: options)
+        end
+        Capybara.default_driver = :selenium_chrome_headless
+      when 'headed'
+        Capybara.default_driver = :selenium_chrome
+      else
+        log_warning("⚠️ Unknown browser mode '#{browser_mode}', using default (headed)")
+        Capybara.default_driver = :selenium_chrome
+      end
+    end
+
+    def generate_slug(name)
+      return nil if name.nil? || name.empty?
+
+      # Transliterate accented characters to ASCII
+      slug = name.downcase
+                 .tr('áàãâäåāăąǎǟǡǻȁȃȧ', 'a')
+                 .tr('éèêëēĕėęěȅȇȩ', 'e')
+                 .tr('íìîïĩīĭįıȉȋ', 'i')
+                 .tr('óòôõöōŏőơǒǿȍȏȫȭȯȱ', 'o')
+                 .tr('úùûüũūŭůűųưȕȗ', 'u')
+                 .tr('çćĉċč', 'c')
+                 .tr('ñńņňʼn', 'n')
+                 .gsub(/[^a-z0-9]+/, '_')
+                 .gsub(/^_+|_+$/, '')
+
+      slug
+    end
+
+    def parse_price(price_text)
+      return nil if price_text.nil? || price_text.empty?
+
+      price_text.gsub(/R\$\s*/, '')
+                .gsub(/\./, '')
+                .gsub(/,/, '.')
+                .to_f
+    rescue
+      nil
+    end
+
+    def save_to_json(products)
+      filename = generate_filename
+
+      dir = File.dirname(filename)
+      FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
+
+      data = build_json_data(products)
+
+      if @alert_system
+        previous_file = find_previous_scrape(filename)
+        @alert_system.process(current_data: data, previous_file:)
+      end
+
+      File.write(filename, JSON.pretty_generate(data))
+      log_info("💾 Results saved to: #{filename}")
+      filename
+    end
+
+    def find_previous_scrape(current_filename)
+      dir = File.dirname(current_filename)
+
+      # Extract the slug pattern from current filename (e.g., "__booster_box.json")
+      basename = File.basename(current_filename)
+      slug_pattern = basename.match(/__(.+)\.json$/)
+
+      return nil unless slug_pattern
+
+      slug = slug_pattern[1]
+
+      # Get all JSON files with the same slug pattern, sorted by name (chronological)
+      matching_files = Dir.glob(File.join(dir, "*__#{slug}.json")).sort.reverse
+
+      # Return the first file that's not the current one (most recent previous)
+      matching_files.find { |f| f != current_filename }
+    end
+
+    # Override in subclasses
+    def generate_filename
+      raise NotImplementedError, "Subclasses must implement generate_filename"
+    end
+
+    # Override in subclasses
+    def build_json_data(products)
+      raise NotImplementedError, "Subclasses must implement build_json_data"
+    end
+
+    # Override in subclasses
+    def scrape
+      raise NotImplementedError, "Subclasses must implement scrape"
+    end
+
+    def close_browser
+      log_info("🔒 Closing browser...")
+      begin
+        Capybara.current_session.driver.quit
+      rescue => e
+        log_error("⚠️ Error closing browser: #{e.message}")
+      end
+    end
+  end
+end
+
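BaseScraper carries the shared machinery (Capybara driver selection, slug and price helpers, JSON persistence, the alert handoff) and leaves three hooks that raise NotImplementedError: scrape, generate_filename, and build_json_data. Note that parse_price assumes Brazilian number formatting, so "R$ 1.234,56" parses to 1234.56. A sketch of the minimal subclass contract (DummyScraper is hypothetical; an actual run still needs Chrome and chromedriver for Capybara's Selenium driver):

    # Hypothetical minimal subclass, for illustration only.
    class DummyScraper < LigaMagicScraper::BaseScraper
      def scrape
        visit 'https://example.com'   # Capybara::DSL comes in via BaseScraper
        [{ name: 'Sample', price: 10.0 }]
      end

      def generate_filename
        "scrapped/dummy/#{Time.now.strftime('%Y%m%d_%H%M%S')}__sample.json"
      end

      def build_json_data(products)
        { scraped_at: Time.now.iso8601, total_products: products.count, products: }
      end
    end

With those three hooks in place, save_to_json creates the output directory, diffs against the most recent earlier file sharing the same __<slug>.json suffix when alerts are configured, and writes pretty-printed JSON.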
data/lib/ligamagic_scraper/scrapers/global_scraper.rb
@@ -0,0 +1,240 @@
+require_relative 'base_scraper'
+require 'nokogiri'
+require 'benchmark'
+require 'set'
+
+module LigaMagicScraper
+  class GlobalScraper < BaseScraper
+    BASE_URL = 'https://www.ligamagic.com.br/?view=cards%2Fsearch&tipo=1'
+    MAX_CLICKS = 50
+
+    attr_reader :search_term
+
+    def initialize(search_term:, browser_mode: 'headed', alert_config: nil)
+      @search_term = search_term
+      @product_html_snapshots = []
+      @timings = {}
+      super(browser_mode:, alert_config:)
+
+      log_info("🚀 Starting Liga Magic global search scraper...")
+      log_info("🔍 Search term: #{search_term}")
+      log_info("🖥️ Browser mode: #{browser_mode}")
+    end
+
+    def url
+      "#{BASE_URL}&card=#{CGI.escape(search_term)}"
+    end
+
+    def scrape
+      start_time = Time.now
+
+      visit url
+      log_info("📄 Loaded initial page")
+
+      # Phase 1: Load all pages (browser open, no capturing)
+      load_time = Benchmark.measure do
+        load_all_products
+      end
+      @timings[:loading] = load_time.real
+      log_info("⏱️ Phase 1 (Loading): #{format('%.2f', load_time.real)}s")
+
+      # Phase 2: Capture all products at once (browser still open)
+      capture_time = Benchmark.measure do
+        capture_all_products
+      end
+      @timings[:capture] = capture_time.real
+      log_info("⏱️ Phase 2 (Capture): #{format('%.2f', capture_time.real)}s")
+
+      # Close browser ASAP
+      close_time = Benchmark.measure do
+        close_browser
+      end
+      @timings[:browser_close] = close_time.real
+      log_info("✅ Browser closed in #{format('%.2f', close_time.real)}s")
+
+      # Phase 3: Extract products from memory (browser closed)
+      log_info("🔍 Extracting products from memory...")
+      products = nil
+      extraction_time = Benchmark.measure do
+        products = extract_products
+      end
+      @timings[:extraction] = extraction_time.real
+      log_info("⏱️ Phase 3 (Extraction): #{format('%.2f', extraction_time.real)}s")
+
+      total_time = Time.now - start_time
+      @timings[:total] = total_time
+
+      log_info("=" * 60)
+      log_info("📊 PERFORMANCE SUMMARY")
+      log_info("=" * 60)
+      log_info("⏱️ Loading (browser open): #{format('%.2f', @timings[:loading])}s")
+      log_info("⏱️ Capture (browser open): #{format('%.2f', @timings[:capture])}s")
+      log_info("⏱️ Browser Close: #{format('%.2f', @timings[:browser_close])}s")
+      log_info("⏱️ Extraction (browser closed): #{format('%.2f', @timings[:extraction])}s")
+      log_info("⏱️ Total Time: #{format('%.2f', @timings[:total])}s")
+      log_info("📦 Products captured: #{@product_html_snapshots.count}")
+      log_info("✅ Valid products extracted: #{products.count}")
+      log_info("=" * 60)
+
+      products
+    rescue => e
+      log_error("❌ Error during scraping: #{e.message}")
+      log_debug(e.backtrace.first(5).join("\n"))
+      []
+    ensure
+      close_browser if @driver
+    end
+
+    def generate_filename
+      datetime_str = Time.now.strftime('%Y%m%d_%H%M%S')
+      slug = search_term.downcase.gsub(/[^a-z0-9]+/, '_').gsub(/^_|_$/, '')
+      "scrapped/global/#{datetime_str}__#{slug}.json"
+    end
+
+    def build_json_data(products)
+      {
+        search_term:,
+        search_type: 'global',
+        scraped_at: Time.now.iso8601,
+        total_products: products.count,
+        products:
+      }
+    end
+
+    private
+
+    def load_all_products
+      log_info("🔄 Starting pagination...")
+      click_count = 0
+
+      loop do
+        if has_unavailable_products?
+          log_warning("⚠️ Found unavailable products, stopping pagination")
+          break
+        end
+
+        load_more_button = find_load_more_button
+
+        if load_more_button.nil?
+          log_info("✅ No more 'Load More' button found")
+          break
+        end
+
+        if click_count >= MAX_CLICKS
+          log_warning("⚠️ Reached maximum click limit (#{MAX_CLICKS})")
+          break
+        end
+
+        begin
+          load_more_button.click
+          click_count += 1
+          log_debug("🔄 Clicked 'Load More' button (#{click_count}x), waiting for new products...")
+          sleep 1
+        rescue => e
+          log_error("❌ Error clicking 'Load More': #{e.message}")
+          break
+        end
+      end
+
+      log_info("✅ Pagination complete. Total clicks: #{click_count}")
+    end
+
+    def has_unavailable_products?
+      page.has_text?(/indispon[ií]vel|esgotado/i, wait: 1)
+    end
+
+    def find_load_more_button
+      begin
+        element = page.find('input.exibir-mais[value="Exibir mais"]', visible: true, wait: 1)
+        return element if element
+      rescue Capybara::ElementNotFound
+        nil
+      end
+    end
+
+    def capture_all_products
+      log_info("📸 Capturing all products...")
+      # Find all product elements in one go and capture their outer HTML
+      product_elements = page.all('div.box.p25 div.mtg-single', wait: 5)
+      product_elements.each do |element|
+        @product_html_snapshots << element[:outerHTML]
+      end
+      log_info("📸 Captured #{@product_html_snapshots.count} product HTML snapshots")
+    end
+
+    def extract_products
+      log_info("🔍 Extracting product data from #{@product_html_snapshots.count} HTML snapshots...")
+      products = []
+      seen_ids = Set.new
+
+      @product_html_snapshots.each_with_index do |html, index|
+        log_debug(".") if (index + 1) % 10 == 0
+
+        begin
+          doc = Nokogiri::HTML(html)
+
+          # Check availability first
+          text_content = doc.text.downcase
+          next if text_content.match?(/indispon[ií]vel|esgotado|sem estoque/)
+
+          name_data = extract_product_name_and_id_from_doc(doc)
+          next unless name_data
+
+          # Skip duplicates (website shows same products on multiple pages)
+          next if seen_ids.include?(name_data[:id])
+          seen_ids.add(name_data[:id])
+
+          prices = extract_product_prices_from_doc(doc)
+          next unless prices
+
+          slug = generate_slug(name_data[:name])
+
+          products << {
+            id: name_data[:id],
+            slug:,
+            name: name_data[:name],
+            min_price: prices[:min_price],
+            avg_price: prices[:avg_price],
+            max_price: prices[:max_price]
+          }
+        rescue => e
+          log_warning("⚠️ Error extracting product #{index + 1}: #{e.message}")
+        end
+      end

+      log_info("✅ Extraction complete. Valid products: #{products.count} (from #{@product_html_snapshots.count} snapshots)")
+      products
+    end
+
+    def extract_product_name_and_id_from_doc(doc)
+      link = doc.at_css('.mtg-names .mtg-name-prod a')
+      return nil unless link
+
+      href = link['href']
+      match = href&.match(/pcode=(\d+)/)
+
+      name = link.text.strip
+      id = match ? match[1] : nil
+
+      return nil if name.empty?
+
+      { name:, id: }
+    end
+
+    def extract_product_prices_from_doc(doc)
+      price_unit = doc.at_css('.mtg-prices .mtg-price-unit')
+      return nil unless price_unit
+
+      min_price_el = price_unit.at_css('.price-min')
+      avg_price_el = price_unit.at_css('.price-avg')
+      max_price_el = price_unit.at_css('.price-max')
+
+      {
+        min_price: parse_price(min_price_el&.text&.strip),
+        avg_price: parse_price(avg_price_el&.text&.strip),
+        max_price: parse_price(max_price_el&.text&.strip)
+      }
+    end
+  end
+end
+
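GlobalScraper drives a real Chrome session through Capybara, then parses the captured HTML offline with Nokogiri after the browser is already closed. A hedged end-to-end sketch using only the public surface shown in this diff (assumes chromedriver is installed; output lands under scrapped/global/ per generate_filename):

    scraper = LigaMagicScraper::GlobalScraper.new(
      search_term: 'Black Lotus',
      browser_mode: 'headless'        # 'headed' opens a visible Chrome window
    )
    products = scraper.scrape          # rescues errors internally; returns [] on failure
    scraper.save_to_json(products)     # writes JSON and returns the filename
    puts scraper.formatted_logs        # Loggable buffered every step above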