webscraper_framework 0.1.733 → 0.1.734
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wsfr +0 -1
- data/lib/webscraper_framework/application.rb +2 -16
- data/lib/webscraper_framework/base.rb +0 -3
- data/lib/webscraper_framework/base_model.rb +8 -5
- data/lib/webscraper_framework/page.rb +37 -7
- data/lib/webscraper_framework.rb +0 -3
- data/views/home.html.haml +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6c75efc4c8371716d412d938322a36b7f3c4ca6
|
4
|
+
data.tar.gz: 6f65b45cf58f197c7d6c46353fe48c4bd8ff15e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71639b2805835f624d9b3e4ba346b4feaf5e4178f3ccc0e2543188e2e7eca6b8cd7393db5cbe3bbfb6fc35626a9a7b17018786bdd8a18f619c3cb8205af17466
|
7
|
+
data.tar.gz: 8ce6f7bee7cf53a6f9aed06ff70f509f55724bb3638cf0f782915b87a2b903579044bf5c87035d579a30eab2a9bc064a793ffb97784fbf5f7146d17422310d3e
|
data/bin/wsfr
CHANGED
@@ -14,7 +14,6 @@ require "sinatra/base"
|
|
14
14
|
module WebscraperFramework
|
15
15
|
class Application < Sinatra::Base
|
16
16
|
|
17
|
-
|
18
17
|
def render_file(file, params= {})
|
19
18
|
gemdir = Gem.loaded_specs["webscraper_framework"].gem_dir
|
20
19
|
base_template = File.read(gemdir + "/views/base.html.haml")
|
@@ -22,23 +21,10 @@ module WebscraperFramework
|
|
22
21
|
return Haml::Engine.new(base_template).render(Object.new, content: content, models: @@models, scrapers: @@scrapers)
|
23
22
|
end
|
24
23
|
|
25
|
-
|
26
|
-
|
27
24
|
model_files = Dir["./models/*"]
|
28
25
|
helper_files = Dir["./helpers/*"]
|
29
26
|
scraper_files = Dir["./scrapers/*"]
|
30
27
|
|
31
|
-
|
32
|
-
puts "-"
|
33
|
-
puts Dir["./"]
|
34
|
-
puts model_files
|
35
|
-
puts helper_files
|
36
|
-
puts scraper_files
|
37
|
-
puts "-"
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
28
|
data = {}
|
43
29
|
|
44
30
|
@@scrapers = []
|
@@ -98,7 +84,7 @@ module WebscraperFramework
|
|
98
84
|
csv << [id] + m.fields.map {|a| obj[a]}
|
99
85
|
end
|
100
86
|
end
|
101
|
-
# render_file("
|
87
|
+
# render_file("", { })
|
102
88
|
end
|
103
89
|
|
104
90
|
get '/models/:name' do
|
@@ -106,7 +92,7 @@ module WebscraperFramework
|
|
106
92
|
end
|
107
93
|
|
108
94
|
|
109
|
-
puts @@models
|
95
|
+
puts "Loaded Models #{@@models}"
|
110
96
|
|
111
97
|
end
|
112
98
|
end
|
@@ -42,10 +42,13 @@ module WebscraperFramework
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def self.all
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
45
|
+
begin
|
46
|
+
YAML::load_file("#{self.name}.yml")
|
47
|
+
YAML::load_file("#{self.name}.yml")[:collecion]
|
48
|
+
YAML::load_file("#{self.name}.yml")[:collection]
|
49
|
+
rescue
|
50
|
+
[]
|
51
|
+
end
|
49
52
|
end
|
50
53
|
|
51
54
|
def fields
|
@@ -60,7 +63,7 @@ module WebscraperFramework
|
|
60
63
|
collection = {} unless collection
|
61
64
|
collection[:collection] = {} unless collection[:collection]
|
62
65
|
collection[:collection][id.to_sym] = to_hash
|
63
|
-
File.open(filename, 'w') {|f| f.write collection.to_yaml }
|
66
|
+
File.open(filename, 'w') {|f| f.write collection.to_yaml }
|
64
67
|
end
|
65
68
|
end
|
66
69
|
|
@@ -1,43 +1,73 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
|
2
3
|
module WebscraperFramework
|
3
4
|
|
4
5
|
class Page
|
5
6
|
|
6
7
|
attr_accessor :html
|
7
8
|
|
9
|
+
# Need better flow for what this does.
|
10
|
+
|
11
|
+
def try_css_attr(css, attr)
|
12
|
+
if element = self.try_css(css)
|
13
|
+
element.attr(attr)
|
14
|
+
else
|
15
|
+
nil
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def try_css_parent_attr(css, attr)
|
20
|
+
if element = try_css(css)
|
21
|
+
element.parent.attr(attr)
|
22
|
+
else
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def try_css(css)
|
28
|
+
self.html.css(css).first
|
29
|
+
end
|
30
|
+
|
31
|
+
|
32
|
+
# Simply gets a webpage based on a url.
|
33
|
+
# from_cache = true (default) will take a cached version
|
34
|
+
# if it exists.
|
35
|
+
|
8
36
|
def self.get_page(url, from_cache = true)
|
9
37
|
url_hash = Digest::SHA256.hexdigest(url)
|
10
38
|
filename = "cache/#{url_hash}"
|
11
39
|
if from_cache && File.file?(filename)
|
12
|
-
result = open(filename)
|
40
|
+
result = open(filename).read
|
13
41
|
puts "Gotten #{filename} from cache"
|
14
42
|
else
|
15
|
-
result = open(url)
|
16
|
-
File.write(filename, result
|
43
|
+
result = open(url).read
|
44
|
+
File.write(filename, result)
|
17
45
|
puts "Written cache file #{filename}"
|
18
46
|
end
|
19
|
-
return result
|
47
|
+
return result
|
20
48
|
end
|
21
49
|
|
22
50
|
def initialize html: nil
|
23
51
|
self.html = html
|
24
52
|
end
|
25
53
|
|
26
|
-
#
|
54
|
+
# helpers for seamless initialisation no matter what starting point
|
55
|
+
|
27
56
|
def self.by_url(url)
|
28
57
|
self.new(html: Nokogiri::HTML(get_page(url)))
|
29
58
|
end
|
30
59
|
|
31
|
-
# helper for seamless initialisation no matter what starting point
|
32
60
|
def self.by_html_string(html_string)
|
33
61
|
self.new(html: Nokogiri::HTML(html_string))
|
34
62
|
end
|
35
63
|
|
36
|
-
# helper for seamless initialisation no matter what starting point
|
37
64
|
def self.by_html(html)
|
38
65
|
self.new(html: html)
|
39
66
|
end
|
40
67
|
|
68
|
+
|
69
|
+
# Returns a collection of pages based on a selector.
|
70
|
+
# Use to collect a collection of elements from a page.
|
41
71
|
def collection_by_selector(selector)
|
42
72
|
self.html.css(selector).map{|item| WebscraperFramework::Page.by_html(item)}
|
43
73
|
end
|
data/lib/webscraper_framework.rb
CHANGED
data/views/home.html.haml
CHANGED
@@ -1 +1 @@
|
|
1
|
-
%p
|
1
|
+
%p
|