artvee_scraper 0.0.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/artvee_scraper.rb +13 -48
- data/lib/card.rb +64 -0
- data/lib/http_fetcher.rb +14 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ad2436d422de3a69964170b72a3296a2f27062d05d395c217178ccac22c7fbed
|
4
|
+
data.tar.gz: d24d8a2d9533b58f9ccdfd8a92334f38950a5a49793cd837de42faa4a7fc0acb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b43f433cf1462e3dd90cab43d4c53e80190199bc8d8beba89c1f049c5a4f7ff52ca487e19680dea54427c5dfd6959bd8db8c7cc00a0309afbdee80681712a370
|
7
|
+
data.tar.gz: 1870b93e70763b2b1992e623a891a3f5bff5e4d187c5dc6934cb10d005047000533de2937f08a7b1903a7c7ae3eb4a25beb8a668021156b587461f8d33bde9b6
|
data/lib/artvee_scraper.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
|
-
# rubocop:disable Lint/MixedRegexpCaptureTypes
|
2
1
|
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
5
|
-
|
3
|
+
require_relative 'card'
|
4
|
+
require_relative 'http_fetcher'
|
6
5
|
|
6
|
+
# Scrapes art data from artvee.com
|
7
7
|
class ArtveeScraper
|
8
8
|
BASE_URL = 'https://artvee.com/'
|
9
9
|
@arts = []
|
10
|
-
@doc = Nokogiri::HTML(
|
10
|
+
@doc = ::Nokogiri::HTML(HttpFetcher.call(BASE_URL))
|
11
11
|
|
12
12
|
class << self
|
13
13
|
def scrape
|
@@ -18,55 +18,20 @@ class ArtveeScraper
|
|
18
18
|
private
|
19
19
|
|
20
20
|
def populate_arts
|
21
|
-
@doc.search('.product-grid-item.product.woodmart-hover-tiled').each do |
|
22
|
-
@arts <<
|
23
|
-
img_url: big_pic_url(card.at('img').attributes['src'].value),
|
24
|
-
title: title(card.at('h3').text),
|
25
|
-
date: date(card.at('h3').text),
|
26
|
-
artist: card.at('.woodmart-product-brands-links a')&.text,
|
27
|
-
artist_details: artist_details(card.at('.woodmart-product-brands-links').text),
|
28
|
-
tag: card.at('.woodmart-product-cats a')&.text
|
29
|
-
}
|
21
|
+
@doc.search('.product-grid-item.product.woodmart-hover-tiled').each do |obj|
|
22
|
+
@arts << art_hash(Card.new(obj))
|
30
23
|
end
|
31
24
|
end
|
32
25
|
|
33
|
-
def
|
34
|
-
original_url.sub(/ftmp/, 'sftb')
|
35
|
-
end
|
36
|
-
|
37
|
-
def title(h3_text)
|
38
|
-
h3_text[..-2].match(/^(?<title>.+?)\s*(\((?<date>[^)]+)\))?$/)[:title]
|
39
|
-
end
|
40
|
-
|
41
|
-
def date(h3_text)
|
42
|
-
h3_text[..-2].match(/^(?<title>.+?)\s*(\((?<date>[^)]+)\))?$/)[:date]
|
43
|
-
end
|
44
|
-
|
45
|
-
def artist_details(div_text)
|
46
|
-
return {} if div_text.split('(').count < 2
|
47
|
-
|
48
|
-
@details = div_text.split('(')[1][0..-2].split(', ')
|
49
|
-
author_life_cycle.merge(nationality)
|
50
|
-
end
|
51
|
-
|
52
|
-
def author_life_cycle
|
53
|
-
return { birth_date: @details.first } if @details.count == 1
|
54
|
-
return { birth_date: @details.last } if @details.last.delete(' ').split(/-|–/).count == 1
|
55
|
-
|
56
|
-
life_cycle_hash(@details.last.delete(' ').split(/-|–/))
|
57
|
-
end
|
58
|
-
|
59
|
-
def life_cycle_hash(life_cycle)
|
26
|
+
def art_hash(card)
|
60
27
|
{
|
61
|
-
|
62
|
-
|
28
|
+
img_url: card.img_url,
|
29
|
+
title: card.title,
|
30
|
+
date: card.date,
|
31
|
+
artist: card.artist,
|
32
|
+
artist_details: card.artist_details,
|
33
|
+
tag: card.tag
|
63
34
|
}
|
64
35
|
end
|
65
|
-
|
66
|
-
def nationality
|
67
|
-
return {} if @details.count == 1
|
68
|
-
|
69
|
-
{ nationality: @details.first }
|
70
|
-
end
|
71
36
|
end
|
72
37
|
end
|
data/lib/card.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Wraps one product-card HTML node and exposes the scraped fields
# (+img_url+, +title+, +date+, +artist+, +artist_details+, +tag+) that
# ArtveeScraper collects into its @arts array.
class Card
  # Splits heading text into a title and an optional trailing "(date)".
  # The outer group is non-capturing so named and numbered captures are not
  # mixed; the strings it matches are unchanged.
  TITLE_AND_DATE = /^(?<title>.+?)\s*(?:\((?<date>[^)]+)\))?$/

  attr_reader :img_url, :title, :date, :artist, :artist_details, :tag

  # @param html_obj [#at] node answering +at(css_selector)+ with a child node
  #   or nil (ArtveeScraper passes the elements returned by its @doc.search)
  def initialize(html_obj)
    @html_obj = html_obj
    set_img_url
    set_date_and_title
    set_artist
    set_artist_details
    set_tag
  end

  private

  # Stores the <img> src with its first "ftmp" replaced by "sftb", or nil when
  # the card has no image / no src attribute.
  # NOTE(review): presumably "sftb" addresses the larger rendition — confirm.
  def set_img_url
    img = @html_obj.at('img')
    src = img && img.attributes['src']
    @img_url = src && src.value.sub(/ftmp/, 'sftb')
  end

  # Parses the <h3> text into @title and @date. Both stay nil when the heading
  # is missing or empty; @date is nil when there is no "(...)" suffix.
  def set_date_and_title
    heading = @html_obj.at('h3')&.text.to_s
    # The last character of the heading is dropped before matching.
    # NOTE(review): presumably a trailing filler character in the markup — confirm.
    parts = TITLE_AND_DATE.match(heading[..-2])
    @title = parts && parts[:title]
    @date = parts && parts[:date]
  end

  def set_artist
    @artist = @html_obj.at('.woodmart-product-brands-links a')&.text
  end

  # Builds @artist_details from the "(nationality, birth - death)" part of the
  # brands-links text. Always assigns a Hash: it is empty when the node is
  # missing or carries no parenthesised details.
  def set_artist_details
    @artist_details = {}
    segments = @html_obj.at('.woodmart-product-brands-links')&.text.to_s.split('(')
    return if segments.count < 2

    # Drop the last character (expected to be the closing parenthesis), then
    # split the comma-separated details.
    @details = segments[1][0..-2].split(', ')
    @artist_details = author_life_cycle.merge(nationality)
  end

  def set_tag
    @tag = @html_obj.at('.woodmart-product-cats a')&.text
  end

  # @return [Hash] birth/passing dates taken from the last detail when it
  #   contains digits; {} otherwise (including when there are no details)
  def author_life_cycle
    years = @details.last
    return {} unless years&.match?(/\d+/)

    # Both a hyphen and an en dash are accepted as the range separator.
    life_cycle_hash(years.delete(' ').split(/-|–/))
  end

  def life_cycle_hash(life_cycle)
    {
      birth_date: life_cycle[0],
      passing_date: life_cycle[1]
    }
  end

  # @return [Hash] the first detail as nationality when it starts with at
  #   least two letters; {} otherwise (including when there are no details)
  def nationality
    country = @details.first
    return {} unless country&.match?(/^[a-zA-Z]{2,}/)

    { nationality: country }
  end
end
|
data/lib/http_fetcher.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'net/http'

# Fetches the content of a given URL and returns its body only if the HTTP
# request receives a 200 response.
class HttpFetcher
  # Performs a single GET request; TLS is used when the URL scheme is https.
  #
  # @param url [String] absolute http(s) URL
  # @return [String, nil] the response body for a 200 response; nil for any
  #   other status (redirects are not followed, so a 3xx also yields nil)
  # @raise [URI::InvalidURIError] when +url+ cannot be parsed
  def self.call(url)
    response = Net::HTTP.get_response(::URI.parse(url))
    response.body if response.code == '200'
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: artvee_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leon Siqueira
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A gem that gets titles, dates, artist, image URLs, etc. and returns as
|
14
14
|
a Hash
|
@@ -18,6 +18,8 @@ extensions: []
|
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
20
|
- lib/artvee_scraper.rb
|
21
|
+
- lib/card.rb
|
22
|
+
- lib/http_fetcher.rb
|
21
23
|
homepage: https://github.com/leon-siqueira/artvee-scraper
|
22
24
|
licenses:
|
23
25
|
- MIT
|
@@ -30,7 +32,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
30
32
|
requirements:
|
31
33
|
- - ">="
|
32
34
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
35
|
+
version: '3.1'
|
34
36
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
37
|
requirements:
|
36
38
|
- - ">="
|