proto 0.0.4 → 0.0.5
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/lib/proto/scraper.rb +18 -25
- data/lib/proto/version.rb +1 -1
- metadata +1 -1
data/lib/proto/scraper.rb
CHANGED
@@ -3,7 +3,7 @@ module Proto
|
|
3
3
|
attr_accessor :url, :doc, :url_collection
|
4
4
|
|
5
5
|
def initialize(url)
|
6
|
-
@url = url.chomp '/'
|
6
|
+
@url = url.chomp '/' #remove trailing slash
|
7
7
|
@doc = Nokogiri::HTML(open(url))
|
8
8
|
end
|
9
9
|
|
@@ -15,45 +15,38 @@ module Proto
|
|
15
15
|
|
16
16
|
def fetch(name='Type', args)
|
17
17
|
if url_collection
|
18
|
-
attributes =
|
19
|
-
protos = create_return_objects(name, attributes)
|
20
|
-
return protos
|
18
|
+
attributes = scrape_multiple_pages(args)
|
21
19
|
else
|
22
|
-
attributes =
|
23
|
-
protos = create_return_objects(name, attributes)
|
24
|
-
return protos
|
20
|
+
attributes = scrape_single_page(args)
|
25
21
|
end
|
22
|
+
protos = create_return_objects(name, attributes)
|
23
|
+
return protos
|
26
24
|
end
|
27
25
|
alias_method :fetch_and_create!, :fetch
|
28
26
|
|
29
27
|
private
|
30
28
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
34
|
-
page = Nokogiri::HTML(open(url))
|
35
|
-
attrs_hash = gather_data(page, attributes)
|
36
|
-
hash_array << attrs_hash
|
29
|
+
def scrape_multiple_pages(attributes)
|
30
|
+
url_collection.each_with_object([]).map do |url, hash_array|
|
31
|
+
gather_data(url, attributes)
|
37
32
|
end
|
38
|
-
return hash_array
|
39
33
|
end
|
40
34
|
|
41
|
-
def gather_data(
|
42
|
-
|
35
|
+
def gather_data(url, attributes)
|
36
|
+
page = Nokogiri::HTML(open(url))
|
37
|
+
attributes.each_with_object({}) do |(key, selector), attrs|
|
43
38
|
attrs[key] = page.css(selector).text.strip
|
44
39
|
end
|
45
40
|
end
|
46
41
|
|
47
|
-
def
|
48
|
-
length_of_scrape =
|
42
|
+
def scrape_single_page(attributes)
|
43
|
+
length_of_scrape = doc.css(attributes.first[1]).count
|
49
44
|
|
50
|
-
|
51
|
-
attributes.inject(
|
52
|
-
hash.merge(attr_name =>
|
45
|
+
length_of_scrape.times.map do |index|
|
46
|
+
attributes.inject({}) do |hash, (attr_name, selector)|
|
47
|
+
hash.merge(attr_name => doc.css(selector)[index].text.strip) if doc.css(selector)[index]
|
53
48
|
end
|
54
|
-
end
|
55
|
-
|
56
|
-
final_array.compact
|
49
|
+
end.compact
|
57
50
|
end
|
58
51
|
|
59
52
|
def create_return_objects(name, attributes)
|
@@ -62,4 +55,4 @@ module Proto
|
|
62
55
|
attributes.map { |hash| Proto.const_get(name).new(hash) }
|
63
56
|
end
|
64
57
|
end
|
65
|
-
end
|
58
|
+
end
|
data/lib/proto/version.rb
CHANGED