webscraper_framework 0.1.733 → 0.1.734
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wsfr +0 -1
- data/lib/webscraper_framework/application.rb +2 -16
- data/lib/webscraper_framework/base.rb +0 -3
- data/lib/webscraper_framework/base_model.rb +8 -5
- data/lib/webscraper_framework/page.rb +37 -7
- data/lib/webscraper_framework.rb +0 -3
- data/views/home.html.haml +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6c75efc4c8371716d412d938322a36b7f3c4ca6
|
4
|
+
data.tar.gz: 6f65b45cf58f197c7d6c46353fe48c4bd8ff15e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71639b2805835f624d9b3e4ba346b4feaf5e4178f3ccc0e2543188e2e7eca6b8cd7393db5cbe3bbfb6fc35626a9a7b17018786bdd8a18f619c3cb8205af17466
|
7
|
+
data.tar.gz: 8ce6f7bee7cf53a6f9aed06ff70f509f55724bb3638cf0f782915b87a2b903579044bf5c87035d579a30eab2a9bc064a793ffb97784fbf5f7146d17422310d3e
|
data/bin/wsfr
CHANGED
@@ -14,7 +14,6 @@ require "sinatra/base"
|
|
14
14
|
module WebscraperFramework
|
15
15
|
class Application < Sinatra::Base
|
16
16
|
|
17
|
-
|
18
17
|
def render_file(file, params= {})
|
19
18
|
gemdir = Gem.loaded_specs["webscraper_framework"].gem_dir
|
20
19
|
base_template = File.read(gemdir + "/views/base.html.haml")
|
@@ -22,23 +21,10 @@ module WebscraperFramework
|
|
22
21
|
return Haml::Engine.new(base_template).render(Object.new, content: content, models: @@models, scrapers: @@scrapers)
|
23
22
|
end
|
24
23
|
|
25
|
-
|
26
|
-
|
27
24
|
model_files = Dir["./models/*"]
|
28
25
|
helper_files = Dir["./helpers/*"]
|
29
26
|
scraper_files = Dir["./scrapers/*"]
|
30
27
|
|
31
|
-
|
32
|
-
puts "-"
|
33
|
-
puts Dir["./"]
|
34
|
-
puts model_files
|
35
|
-
puts helper_files
|
36
|
-
puts scraper_files
|
37
|
-
puts "-"
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
28
|
data = {}
|
43
29
|
|
44
30
|
@@scrapers = []
|
@@ -98,7 +84,7 @@ module WebscraperFramework
|
|
98
84
|
csv << [id] + m.fields.map {|a| obj[a]}
|
99
85
|
end
|
100
86
|
end
|
101
|
-
# render_file("
|
87
|
+
# render_file("", { })
|
102
88
|
end
|
103
89
|
|
104
90
|
get '/models/:name' do
|
@@ -106,7 +92,7 @@ module WebscraperFramework
|
|
106
92
|
end
|
107
93
|
|
108
94
|
|
109
|
-
puts @@models
|
95
|
+
puts "Loaded Models #{@@models}"
|
110
96
|
|
111
97
|
end
|
112
98
|
end
|
@@ -42,10 +42,13 @@ module WebscraperFramework
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def self.all
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
45
|
+
begin
|
46
|
+
YAML::load_file("#{self.name}.yml")
|
47
|
+
YAML::load_file("#{self.name}.yml")[:collecion]
|
48
|
+
YAML::load_file("#{self.name}.yml")[:collection]
|
49
|
+
rescue
|
50
|
+
[]
|
51
|
+
end
|
49
52
|
end
|
50
53
|
|
51
54
|
def fields
|
@@ -60,7 +63,7 @@ module WebscraperFramework
|
|
60
63
|
collection = {} unless collection
|
61
64
|
collection[:collection] = {} unless collection[:collection]
|
62
65
|
collection[:collection][id.to_sym] = to_hash
|
63
|
-
File.open(filename, 'w') {|f| f.write collection.to_yaml }
|
66
|
+
File.open(filename, 'w') {|f| f.write collection.to_yaml }
|
64
67
|
end
|
65
68
|
end
|
66
69
|
|
@@ -1,43 +1,73 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
|
2
3
|
module WebscraperFramework
|
3
4
|
|
4
5
|
class Page
|
5
6
|
|
6
7
|
attr_accessor :html
|
7
8
|
|
9
|
+
# Need better flow for what this does.
|
10
|
+
|
11
|
+
def try_css_attr(css, attr)
|
12
|
+
if element = self.try_css(css)
|
13
|
+
element.attr(attr)
|
14
|
+
else
|
15
|
+
nil
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def try_css_parent_attr(css, attr)
|
20
|
+
if element = try_css(css)
|
21
|
+
element.parent.attr(attr)
|
22
|
+
else
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def try_css(css)
|
28
|
+
self.html.css(css).first
|
29
|
+
end
|
30
|
+
|
31
|
+
|
32
|
+
# Simply gets a webpage based on a url.
|
33
|
+
# from_cache = true (default) will take a cached version
|
34
|
+
# if it exists.
|
35
|
+
|
8
36
|
def self.get_page(url, from_cache = true)
|
9
37
|
url_hash = Digest::SHA256.hexdigest(url)
|
10
38
|
filename = "cache/#{url_hash}"
|
11
39
|
if from_cache && File.file?(filename)
|
12
|
-
result = open(filename)
|
40
|
+
result = open(filename).read
|
13
41
|
puts "Gotten #{filename} from cache"
|
14
42
|
else
|
15
|
-
result = open(url)
|
16
|
-
File.write(filename, result
|
43
|
+
result = open(url).read
|
44
|
+
File.write(filename, result)
|
17
45
|
puts "Written cache file #{filename}"
|
18
46
|
end
|
19
|
-
return result
|
47
|
+
return result
|
20
48
|
end
|
21
49
|
|
22
50
|
def initialize html: nil
|
23
51
|
self.html = html
|
24
52
|
end
|
25
53
|
|
26
|
-
#
|
54
|
+
# helpers for seamless initialisation no matter what starting point
|
55
|
+
|
27
56
|
def self.by_url(url)
|
28
57
|
self.new(html: Nokogiri::HTML(get_page(url)))
|
29
58
|
end
|
30
59
|
|
31
|
-
# helper for seamless initialisation no matter what starting point
|
32
60
|
def self.by_html_string(html_string)
|
33
61
|
self.new(html: Nokogiri::HTML(html_string))
|
34
62
|
end
|
35
63
|
|
36
|
-
# helper for seamless initialisation no matter what starting point
|
37
64
|
def self.by_html(html)
|
38
65
|
self.new(html: html)
|
39
66
|
end
|
40
67
|
|
68
|
+
|
69
|
+
# Returns a collection of pages based on a selector.
|
70
|
+
# Use to collect a collection of elements from a page.
|
41
71
|
def collection_by_selector(selector)
|
42
72
|
self.html.css(selector).map{|item| WebscraperFramework::Page.by_html(item)}
|
43
73
|
end
|
data/lib/webscraper_framework.rb
CHANGED
data/views/home.html.haml
CHANGED
@@ -1 +1 @@
|
|
1
|
-
%p
|
1
|
+
%p
|