webscraper_framework 0.1.723 → 0.1.724
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/webscraper_framework.rb +1 -2
- data/lib/webscraper_framework/page.rb +46 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 242e3ed4c356854b10936f265d2dc8da7996b6b0
|
4
|
+
data.tar.gz: 7bec4b43bddbd925338b51f74bb8900a0639f80f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d36f595bf54d918a1e62ad32127b4a4ee4ebb2709f6b22d89c8544958b7739f245c9ccea111f7c4a8f5448040385989a5606230a7176fb7af4dbdbce27afbd3a
|
7
|
+
data.tar.gz: 5140c0a076918482138e53589b9b418899fc4167905920b8ae13c776a69a7e6ffe4b8e4e0e27863bcbe5a63e91ef8745dde567df941ff145e83e5261dd79b99c
|
data/lib/webscraper_framework.rb
CHANGED
@@ -1,7 +1,5 @@
|
|
1
|
-
|
2
1
|
module WebscraperFramework
|
3
2
|
|
4
|
-
|
5
3
|
end
|
6
4
|
|
7
5
|
|
@@ -9,3 +7,4 @@ require "webscraper_framework/application"
|
|
9
7
|
require "webscraper_framework/base"
|
10
8
|
require "webscraper_framework/base_model"
|
11
9
|
require "webscraper_framework/cli"
|
10
|
+
require "webscraper_framework/page"
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "digest"
require "fileutils"
require "open-uri"

module WebscraperFramework

  # Thin wrapper around an HTML document or node.
  #
  # Instances are built from a URL (downloaded, cached on disk and parsed with
  # Nokogiri), from a raw HTML string (parsed with Nokogiri), or from an
  # object that is stored as-is. The wrapped object only needs to respond to
  # +css+ for #collection_by_selector to work.
  class Page

    # Directory, relative to the current working directory, where downloaded
    # page bodies are stored. Each file is named after the SHA256 hex digest
    # of the URL it was downloaded from.
    CACHE_DIR = "cache"

    # The wrapped HTML object (whatever was handed to the constructor).
    attr_accessor :html

    # Returns the body of +url+ as a String, using the on-disk cache.
    #
    # When +from_cache+ is truthy and a cache file for the URL exists, the
    # cached copy is returned and nothing is downloaded. Otherwise the URL is
    # downloaded, the body is written to the cache (creating the cache
    # directory when needed) and that same body is returned.
    #
    # @param url [String] address of the page to fetch
    # @param from_cache [Boolean] pass false to force a fresh download
    # @return [String] the page body
    # @raise [ArgumentError] when +url+ is not something open-uri can download
    def self.get_page(url, from_cache = true)
      filename = "#{CACHE_DIR}/#{Digest::SHA256.hexdigest(url)}"

      if from_cache && File.file?(filename)
        # File.read opens and closes the file itself, so no handle is leaked.
        body = File.read(filename)
        puts "Gotten #{filename} from cache"
      else
        # Read the response exactly once: a second read on the same stream
        # would start at end-of-file and yield an empty string.
        body = fetch_remote(url)
        FileUtils.mkdir_p(CACHE_DIR)
        File.write(filename, body)
        puts "Written cache file #{filename}"
      end

      body
    end

    # Downloads +url+ and returns the response body.
    #
    # The URL is parsed first and read through open-uri's URI support rather
    # than Kernel#open, because Kernel#open runs a subprocess for strings
    # that start with "|".
    #
    # @param url [String] address to download
    # @return [String] the response body
    # @raise [ArgumentError] when the parsed URI has no open-uri reader
    def self.fetch_remote(url)
      uri = URI.parse(url)
      unless uri.respond_to?(:read)
        raise ArgumentError, "cannot fetch #{url.inspect}: not a URL open-uri can download"
      end

      uri.read
    end
    private_class_method :fetch_remote

    # @param html [Object, nil] the object to wrap
    def initialize(html: nil)
      self.html = html
    end

    # Builds a Page from a URL: the body is obtained through .get_page (and
    # therefore cached) and parsed with Nokogiri::HTML.
    #
    # @param url [String] address of the page
    # @return [Page]
    def self.by_url(url)
      new(html: Nokogiri::HTML(get_page(url)))
    end

    # Builds a Page from a raw HTML string, parsed with Nokogiri::HTML.
    #
    # @param html_string [String] HTML markup
    # @return [Page]
    def self.by_html_string(html_string)
      new(html: Nokogiri::HTML(html_string))
    end

    # Builds a Page around +html+ without parsing or converting it.
    #
    # @param html [Object] the object to wrap
    # @return [Page]
    def self.by_html(html)
      new(html: html)
    end

    # Runs +selector+ through the wrapped object's +css+ method and wraps
    # every result in its own Page.
    #
    # @param selector [String] selector passed to +html.css+
    # @return [Array<Page>] one Page per matched item
    def collection_by_selector(selector)
      html.css(selector).map { |item| Page.by_html(item) }
    end

  end

end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webscraper_framework
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.723
|
4
|
+
version: 0.1.724
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene van Pelt
|
@@ -108,6 +108,7 @@ files:
|
|
108
108
|
- lib/webscraper_framework/base.rb
|
109
109
|
- lib/webscraper_framework/base_model.rb
|
110
110
|
- lib/webscraper_framework/cli.rb
|
111
|
+
- lib/webscraper_framework/page.rb
|
111
112
|
- views/base.html.haml
|
112
113
|
- views/home.html.haml
|
113
114
|
- views/model.html.haml
|