getter_cyndi5 0.0.6 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/getter_cyndi5 +11 -7
- data/lib/getter_cyndi5.rb +25 -8
- data/lib/getter_cyndi5/parser.rb +24 -11
- data/lib/getter_cyndi5/product.rb +3 -7
- data/lib/getter_cyndi5/retriever.rb +17 -21
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8897522fa8be7a71ee0a93636ad6c8250dc8ceb4b774e4682b6add74440e3561
|
|
4
|
+
data.tar.gz: 2f457d708b7c100b56a354101efe367299e6bdb97dbaa6c9f9b019b9f68b9318
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5d627257e6e981258d321032a5959c0ddee38193606ae0d1f1538a2bebbc19b31c1570b477131ed29075b0c71cb1de9c1811eebf431c73d118f135689535383d
|
|
7
|
+
data.tar.gz: 3280ea73a619aea5be935a3fd7f2e622deecf7786e1d3967f3c22cc38e38678f804e21cc909e7e132791fe9ab5a69af36274f6b9010301113dcf1ae8525c0f23
|
data/bin/getter_cyndi5
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
2
|
|
|
3
3
|
require 'getter_cyndi5'
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
4
|
+
|
|
5
|
+
options = {
|
|
6
|
+
base_url: ARGV[0] || 'https://thehappyco.com',
|
|
7
|
+
products_page_path: ARGV[1] || '/kelly/products',
|
|
8
|
+
item_row_selector: ARGV[2] || '.item-row',
|
|
9
|
+
item_anchor_selector: ARGV[3] || 'div > div.product-desc.text-center > div.product-title > h3 > a',
|
|
10
|
+
item_price_selector: ARGV[4] || 'div > div.product-desc.text-center > div.product-price',
|
|
11
|
+
mode: (ARGV[7] || '2').to_i,
|
|
12
|
+
filename: ARGV[8] || './tmp/test_document.html'
|
|
13
|
+
}
|
|
14
|
+
GetterCyndi5.go(**options)
|
data/lib/getter_cyndi5.rb
CHANGED
|
@@ -3,26 +3,43 @@ class GetterCyndi5
|
|
|
3
3
|
# Getter
|
|
4
4
|
#
|
|
5
5
|
# Example:
|
|
6
|
-
# >> GetterCyndi5.go(
|
|
6
|
+
# >> GetterCyndi5.go(
|
|
7
|
+
# base_url = 'https://thehappyco.com',
|
|
8
|
+
# products_page_path = '/kelly/products',
|
|
9
|
+
# item_row_selector = '.item-row',
|
|
10
|
+
# item_anchor_selector = 'div > div.product-desc.text-center > div.product-title > h3 > a',
|
|
11
|
+
# item_price_selector = 'div > div.product-desc.text-center > div.product-price',
|
|
12
|
+
# price_a_text: 'One Time Purchase',
|
|
13
|
+
# price_b_text: 'SmartShip',
|
|
14
|
+
# mode = 2,
|
|
15
|
+
# filename = './tmp/test_document.html'
|
|
16
|
+
# )
|
|
7
17
|
# Arguments:
|
|
8
18
|
# base_url: (String)
|
|
9
19
|
# products_page_path: (String)
|
|
10
20
|
# item_row_selector: (String)
|
|
11
21
|
# item_anchor_selector: (String)
|
|
12
|
-
#
|
|
22
|
+
# item_price_selector: (String)
|
|
23
|
+
# price_a_text: (String)
|
|
24
|
+
# price_b_text: (String)
|
|
25
|
+
# mode: (Integer)
|
|
26
|
+
# 0 = retrieve and parse without saving HTML document to file,
|
|
27
|
+
# 1 = retrieve and parse saving HTML document to file,
|
|
28
|
+
# 2 = load and parse HTML document from file
|
|
13
29
|
# filename: (String)
|
|
14
30
|
|
|
15
|
-
def self.go(
|
|
16
|
-
all_products = products(
|
|
31
|
+
def self.go(options = {})
|
|
32
|
+
all_products = products(**options)
|
|
17
33
|
all_products.each do |product|
|
|
18
|
-
puts "#{product.
|
|
34
|
+
puts "#{product.inspect}"
|
|
19
35
|
end
|
|
20
36
|
end
|
|
21
37
|
|
|
22
|
-
def self.products(
|
|
23
|
-
retriever = Retriever.new(
|
|
38
|
+
def self.products(options = {})
|
|
39
|
+
retriever = Retriever.new(**options)
|
|
24
40
|
retriever.retrieve
|
|
25
|
-
|
|
41
|
+
|
|
42
|
+
parser = Parser.new(document: retriever.document, **options)
|
|
26
43
|
products = parser.parse
|
|
27
44
|
end
|
|
28
45
|
end
|
data/lib/getter_cyndi5/parser.rb
CHANGED
|
@@ -1,21 +1,34 @@
|
|
|
1
1
|
require 'getter_cyndi5/product'
|
|
2
2
|
class GetterCyndi5::Parser
|
|
3
|
+
DATA_PREFIX = 'data-'.freeze
|
|
3
4
|
attr_reader :products
|
|
4
|
-
def initialize(
|
|
5
|
-
@
|
|
6
|
-
@base_url = base_url
|
|
7
|
-
@item_row_selector = item_row_selector
|
|
8
|
-
@item_anchor_selector = item_anchor_selector
|
|
5
|
+
def initialize(options = {})
|
|
6
|
+
@options = options
|
|
9
7
|
@products = []
|
|
10
8
|
end
|
|
11
9
|
|
|
12
|
-
def parse
|
|
13
|
-
|
|
10
|
+
def parse
|
|
11
|
+
document = @options.fetch(:document)
|
|
12
|
+
base_url = @options.fetch(:base_url)
|
|
13
|
+
item_row_selector = @options.fetch(:item_row_selector)
|
|
14
|
+
item_anchor_selector = @options.fetch(:item_anchor_selector)
|
|
15
|
+
item_price_selector = @options.fetch(:item_price_selector)
|
|
16
|
+
item_rows = document.css(item_row_selector)
|
|
14
17
|
item_rows.each do |item_row|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
item_data = item_row.attributes.select { |k, v| k.start_with? DATA_PREFIX }.map { |k, v| [k.delete_prefix(DATA_PREFIX), v.value]}.to_h
|
|
19
|
+
product_element = item_row.css(item_anchor_selector)[0]
|
|
20
|
+
price_elements = item_row.css(item_price_selector)
|
|
21
|
+
prices = {}
|
|
22
|
+
price_elements.each do |price_element|
|
|
23
|
+
prices[price_element.children[1].text] = price_element.children[0].text.gsub(/[^\d\.]/, '').to_f
|
|
24
|
+
end
|
|
25
|
+
attributes = {
|
|
26
|
+
name: product_element.text,
|
|
27
|
+
url: "#{base_url}#{product_element.attributes['href']}",
|
|
28
|
+
item_data: item_data,
|
|
29
|
+
prices: prices
|
|
30
|
+
}
|
|
31
|
+
product = GetterCyndi5::Product.new(attributes)
|
|
19
32
|
products.append(product)
|
|
20
33
|
end
|
|
21
34
|
products
|
data/lib/getter_cyndi5/product.rb
CHANGED
|
@@ -1,10 +1,6 @@
|
|
|
1
1
|
class GetterCyndi5::Product
|
|
2
|
-
attr_accessor :
|
|
3
|
-
def initialize(
|
|
4
|
-
@
|
|
5
|
-
@url = url
|
|
6
|
-
@price_a = price_a
|
|
7
|
-
@price_b = price_b
|
|
8
|
-
@price_c = price_c
|
|
2
|
+
attr_accessor :attributes
|
|
3
|
+
def initialize(attributes = {})
|
|
4
|
+
@attributes = attributes
|
|
9
5
|
end
|
|
10
6
|
end
|
data/lib/getter_cyndi5/retriever.rb
CHANGED
|
@@ -4,34 +4,30 @@ require 'watir'
|
|
|
4
4
|
require 'webdrivers'
|
|
5
5
|
|
|
6
6
|
class GetterCyndi5::Retriever
|
|
7
|
-
def initialize(
|
|
8
|
-
@
|
|
9
|
-
@products_page_path = products_page_path
|
|
10
|
-
@products_page_url = "#{@base_url}#{@products_page_path}"
|
|
11
|
-
@item_row_selector = item_row_selector
|
|
12
|
-
@mode = mode
|
|
13
|
-
@filename = filename
|
|
7
|
+
def initialize(options = {})
|
|
8
|
+
@options = options
|
|
14
9
|
end
|
|
15
10
|
|
|
11
|
+
attr_reader :document
|
|
12
|
+
attr_reader :item_row_elements
|
|
16
13
|
def retrieve()
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
browser.
|
|
14
|
+
products_page_url = "#{@options.fetch(:base_url)}#{@options.fetch(:products_page_path)}"
|
|
15
|
+
mode = @options.fetch(:mode)
|
|
16
|
+
if mode == 0 || mode == 1
|
|
17
|
+
browser = Watir::Browser.new :chrome, args: %w[--headless --no-sandbox --disable-dev-shm-usage --disable-gpu --remote-debugging-port=9222]
|
|
18
|
+
browser.goto(products_page_url)
|
|
19
|
+
@item_row_elements = browser.elements(css: @options.fetch(:item_row_selector))
|
|
22
20
|
end
|
|
23
|
-
if
|
|
24
|
-
File.write(@filename, browser.html)
|
|
21
|
+
if mode == 1
|
|
22
|
+
File.write(@options.fetch(:filename), browser.html)
|
|
23
|
+
browser.close
|
|
25
24
|
end
|
|
26
|
-
if
|
|
25
|
+
if mode == 0
|
|
27
26
|
@document = Nokogiri::HTML(browser.html)
|
|
27
|
+
browser.close
|
|
28
28
|
end
|
|
29
|
-
if
|
|
30
|
-
@document = File.open(@filename) { |f| Nokogiri::HTML(f) }
|
|
29
|
+
if mode == 1 || mode == 2
|
|
30
|
+
@document = File.open(@options.fetch(:filename)) { |f| Nokogiri::HTML(f) }
|
|
31
31
|
end
|
|
32
32
|
end
|
|
33
|
-
|
|
34
|
-
def document
|
|
35
|
-
@document
|
|
36
|
-
end
|
|
37
33
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: getter_cyndi5
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 2.0.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Cyndi Cavanaugh
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-03-
|
|
11
|
+
date: 2021-03-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: httparty
|