elsmore 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/hackference.co.uk/assets/css/site.css +488 -0
- data/hackference.co.uk/assets/css/slicknav.css +178 -0
- data/hackference.co.uk/assets/css/tito.css +130 -0
- data/hackference.co.uk/assets/img/algolia.png +0 -0
- data/hackference.co.uk/assets/img/andrew-faraday.jpg +0 -0
- data/hackference.co.uk/assets/img/conf.png +0 -0
- data/hackference.co.uk/assets/img/contentful.jpg +0 -0
- data/hackference.co.uk/assets/img/cristiano-betta.jpg +0 -0
- data/hackference.co.uk/assets/img/dan-jenkins.png +0 -0
- data/hackference.co.uk/assets/img/daniel-knell.jpg +0 -0
- data/hackference.co.uk/assets/img/etiene-dalcol.jpg +0 -0
- data/hackference.co.uk/assets/img/felienne-hermans.jpg +0 -0
- data/hackference.co.uk/assets/img/hack.png +0 -0
- data/hackference.co.uk/assets/img/hugh-rawlinson.jpg +0 -0
- data/hackference.co.uk/assets/img/improbable.png +0 -0
- data/hackference.co.uk/assets/img/jessica-rose-cartoon.png +0 -0
- data/hackference.co.uk/assets/img/jonathan-kingsley.jpeg +0 -0
- data/hackference.co.uk/assets/img/logo-small.png +0 -0
- data/hackference.co.uk/assets/img/martin-splitt.jpg +0 -0
- data/hackference.co.uk/assets/img/microsoft.png +0 -0
- data/hackference.co.uk/assets/img/mozilla.png +0 -0
- data/hackference.co.uk/assets/img/nexmo.png +0 -0
- data/hackference.co.uk/assets/img/packt.png +0 -0
- data/hackference.co.uk/assets/img/pebble.png +0 -0
- data/hackference.co.uk/assets/img/proactive.png +0 -0
- data/hackference.co.uk/assets/img/pusher.png +0 -0
- data/hackference.co.uk/assets/img/remy-sharp.jpg +0 -0
- data/hackference.co.uk/assets/img/sam-wierema.jpg +0 -0
- data/hackference.co.uk/assets/img/samathy-barratt.jpg +0 -0
- data/hackference.co.uk/assets/img/soledad.png +0 -0
- data/hackference.co.uk/assets/img/technical-team-solutions.png +0 -0
- data/hackference.co.uk/assets/img/terence-eden.png +0 -0
- data/hackference.co.uk/cdn-cgi/l/email-protection/index.html +75 -0
- data/hackference.co.uk/cdn-cgi/scripts/cf.common.js +78 -0
- data/hackference.co.uk/cdn-cgi/scripts/zepto.min.js +2 -0
- data/hackference.co.uk/cdn-cgi/styles/cf.errors.css +1 -0
- data/hackference.co.uk/cdnjs.cloudflare.com/ajax/libs/SlickNav/1.0.7/jquery.slicknav.min.js +6 -0
- data/hackference.co.uk/code-of-conduct.html +525 -0
- data/hackference.co.uk/code.jquery.com/jquery-1.12.3.min.js +5 -0
- data/hackference.co.uk/conference.html +776 -0
- data/hackference.co.uk/contact.html +365 -0
- data/hackference.co.uk/credits.html +399 -0
- data/hackference.co.uk/hackathon.html +585 -0
- data/hackference.co.uk/index.html +543 -0
- data/hackference.co.uk/js.tito.io/v1/index.html +1 -0
- data/hackference.co.uk/maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css +6 -0
- data/hackference.co.uk/maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css +4 -0
- data/hackference.co.uk/platform.twitter.com/js/tweet.00c2bb81d4f3f312a9a021c715a1c9dc.js +1 -0
- data/hackference.co.uk/platform.twitter.com/widgets.js +11 -0
- data/hackference.co.uk/speakers/andrew-faraday.html +363 -0
- data/hackference.co.uk/speakers/dan-jenkins.html +378 -0
- data/hackference.co.uk/speakers/daniel-knell.html +382 -0
- data/hackference.co.uk/speakers/etiene-dalcol.html +378 -0
- data/hackference.co.uk/speakers/felienne-hermans.html +386 -0
- data/hackference.co.uk/speakers/hugh-rawlinson.html +378 -0
- data/hackference.co.uk/speakers/jonathan-kingsley.html +386 -0
- data/hackference.co.uk/speakers/martin-splitt.html +402 -0
- data/hackference.co.uk/speakers/remy-sharp.html +380 -0
- data/hackference.co.uk/speakers/sam-wierema.html +378 -0
- data/hackference.co.uk/speakers/samathy-barratt.html +397 -0
- data/hackference.co.uk/speakers/soledad-penades.html +382 -0
- data/hackference.co.uk/speakers/terence-eden.html +388 -0
- data/hackference.co.uk/sponsors/algolia.html +350 -0
- data/hackference.co.uk/sponsors/contentful.html +350 -0
- data/hackference.co.uk/sponsors/improbable.html +351 -0
- data/hackference.co.uk/sponsors/index.html +655 -0
- data/hackference.co.uk/sponsors/microsoft.html +350 -0
- data/hackference.co.uk/sponsors/nexmo.html +350 -0
- data/hackference.co.uk/sponsors/packt.html +350 -0
- data/hackference.co.uk/sponsors/pebble.html +350 -0
- data/hackference.co.uk/sponsors/pusher.html +350 -0
- data/hackference.co.uk/sponsors/sponsor-us.html +658 -0
- data/hackference.co.uk/tickets.html +406 -0
- data/lib/elsmore/command.rb +2 -1
- data/lib/elsmore/document.rb +7 -3
- data/lib/elsmore/emitter.rb +22 -12
- data/lib/elsmore/resource.rb +4 -3
- data/lib/elsmore/rewriter.rb +13 -3
- data/lib/elsmore/scraper.rb +42 -26
- data/lib/elsmore/version.rb +1 -1
- data/lib/elsmore/writer.rb +4 -1
- metadata +74 -1
data/lib/elsmore/resource.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
require '
|
1
|
+
require 'httparty'
|
2
2
|
|
3
3
|
module Elsmore
|
4
4
|
class Resource
|
5
|
-
attr_accessor :url, :filename
|
5
|
+
attr_accessor :url, :filename, :emitter
|
6
6
|
|
7
7
|
def initialize url, parent
|
8
8
|
self.url = Elsmore::Url.new(url, parent)
|
@@ -10,12 +10,13 @@ module Elsmore
|
|
10
10
|
|
11
11
|
def write!
|
12
12
|
writer = Elsmore::Writer.new(self)
|
13
|
+
writer.emitter = emitter
|
13
14
|
writer.write
|
14
15
|
self.filename = writer.canonical_filename
|
15
16
|
end
|
16
17
|
|
17
18
|
def data
|
18
|
-
@data ||=
|
19
|
+
@data ||= HTTParty.get(url.canonical_url)
|
19
20
|
end
|
20
21
|
end
|
21
22
|
end
|
data/lib/elsmore/rewriter.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Elsmore
|
2
2
|
class Rewriter
|
3
|
-
attr_accessor :resource
|
3
|
+
attr_accessor :resource, :emitter
|
4
4
|
|
5
5
|
def initialize resource
|
6
6
|
self.resource = resource
|
@@ -20,7 +20,7 @@ module Elsmore
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def write_css
|
23
|
-
resource.doc.xpath('//link[rel=stylesheet]').each do |element|
|
23
|
+
resource.doc.xpath('//link[@rel="stylesheet"]').each do |element|
|
24
24
|
write_element(element, 'href')
|
25
25
|
end
|
26
26
|
end
|
@@ -41,7 +41,10 @@ module Elsmore
|
|
41
41
|
return unless element.attribute(key)
|
42
42
|
url = element.attribute(key).value
|
43
43
|
_resource = Elsmore::Resource.new(url, resource.url)
|
44
|
+
_resource.emitter = emitter
|
44
45
|
_resource.write!
|
46
|
+
|
47
|
+
emitter.log("# Rewriting #{url} => #{_resource.filename}") if url != _resource.filename
|
45
48
|
element.attribute(key).value = _resource.filename
|
46
49
|
end
|
47
50
|
|
@@ -49,8 +52,15 @@ module Elsmore
|
|
49
52
|
resource.doc.xpath('//a').each do |element|
|
50
53
|
return unless element.attribute('href')
|
51
54
|
href = element.attribute('href').value
|
55
|
+
|
52
56
|
url = Elsmore::Url.new(href, resource.url)
|
53
|
-
|
57
|
+
if url.valid
|
58
|
+
new_url = url.absolute_path_or_external_url
|
59
|
+
else
|
60
|
+
new_url = href
|
61
|
+
end
|
62
|
+
emitter.log("# Rewriting #{href} => #{new_url}") if href != new_url
|
63
|
+
element.attribute('href').value = new_url
|
54
64
|
end
|
55
65
|
end
|
56
66
|
end
|
data/lib/elsmore/scraper.rb
CHANGED
@@ -1,49 +1,65 @@
|
|
1
1
|
module Elsmore
|
2
2
|
class Scraper
|
3
|
-
attr_accessor :emitter
|
3
|
+
attr_accessor :emitter, :unprocessed, :processed, :invalid, :unprocessed_urls, :valid_domains
|
4
4
|
|
5
5
|
def initialize initial_url
|
6
6
|
seed = Elsmore::Document.new(initial_url)
|
7
7
|
|
8
|
-
|
9
|
-
@unprocessed = [seed]
|
10
|
-
@processed = []
|
11
|
-
@invalid = []
|
12
|
-
end
|
8
|
+
self.valid_domains = [seed.url.host]
|
13
9
|
|
14
|
-
|
15
|
-
|
16
|
-
document = @unprocessed.shift
|
17
|
-
next if @processed.include?(document.url.canonical_url)
|
18
|
-
emitter.dot
|
10
|
+
self.unprocessed = [seed]
|
11
|
+
self.unprocessed_urls = [seed.url.canonical_url]
|
19
12
|
|
20
|
-
|
21
|
-
|
22
|
-
|
13
|
+
self.processed = []
|
14
|
+
self.invalid = []
|
15
|
+
end
|
23
16
|
|
24
|
-
|
17
|
+
def run
|
18
|
+
while !unprocessed.empty?
|
19
|
+
document = unprocessed.shift
|
20
|
+
process document
|
25
21
|
end
|
26
22
|
|
27
23
|
{
|
28
|
-
processed:
|
29
|
-
invalid:
|
24
|
+
processed: processed,
|
25
|
+
invalid: invalid
|
30
26
|
}
|
31
27
|
end
|
32
28
|
|
33
29
|
private
|
34
30
|
|
31
|
+
def process document
|
32
|
+
emitter.log(document.url.canonical_url.colorize(:green))
|
33
|
+
|
34
|
+
document.emitter = emitter
|
35
|
+
|
36
|
+
enqueue(document.links)
|
37
|
+
document.rewrite
|
38
|
+
document.write!
|
39
|
+
|
40
|
+
processed << document.url.canonical_url
|
41
|
+
end
|
42
|
+
|
35
43
|
def enqueue links
|
36
44
|
links.each_with_index do |document, index|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
45
|
+
next unless valid?(document)
|
46
|
+
next if !valid_domains.include?(document.url.host)
|
47
|
+
next if processed.include?(document.url.canonical_url)
|
48
|
+
next if unprocessed_urls.include?(document.url.canonical_url)
|
49
|
+
|
50
|
+
emitter.log("> Enqueued: #{document.url.canonical_url}")
|
51
|
+
|
52
|
+
unprocessed << document
|
53
|
+
unprocessed_urls << document.url.canonical_url
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def valid?(document)
|
58
|
+
if !document.url.valid && !self.invalid.include?(document.url.raw_url)
|
59
|
+
emitter.warning("> Invalid URL: #{document.url.raw_url}")
|
60
|
+
invalid << document.url.raw_url
|
46
61
|
end
|
62
|
+
document.url.valid
|
47
63
|
end
|
48
64
|
end
|
49
65
|
end
|
data/lib/elsmore/version.rb
CHANGED
data/lib/elsmore/writer.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Elsmore
|
2
2
|
class Writer
|
3
|
-
attr_accessor :resource
|
3
|
+
attr_accessor :resource, :emitter
|
4
4
|
|
5
5
|
def initialize resource
|
6
6
|
self.resource = resource
|
@@ -24,6 +24,9 @@ module Elsmore
|
|
24
24
|
def write_file
|
25
25
|
return if File.exist?(full_filename)
|
26
26
|
ensure_directory full_filename
|
27
|
+
|
28
|
+
emitter.log("! Saving #{full_filename}")
|
29
|
+
|
27
30
|
File.open(full_filename, 'w') do |file|
|
28
31
|
file.write(resource.data)
|
29
32
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: elsmore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristiano Betta
|
@@ -111,6 +111,79 @@ files:
|
|
111
111
|
- Rakefile
|
112
112
|
- bin/elsmore
|
113
113
|
- elsmore.gemspec
|
114
|
+
- hackference.co.uk/assets/css/site.css
|
115
|
+
- hackference.co.uk/assets/css/slicknav.css
|
116
|
+
- hackference.co.uk/assets/css/tito.css
|
117
|
+
- hackference.co.uk/assets/img/algolia.png
|
118
|
+
- hackference.co.uk/assets/img/andrew-faraday.jpg
|
119
|
+
- hackference.co.uk/assets/img/conf.png
|
120
|
+
- hackference.co.uk/assets/img/contentful.jpg
|
121
|
+
- hackference.co.uk/assets/img/cristiano-betta.jpg
|
122
|
+
- hackference.co.uk/assets/img/dan-jenkins.png
|
123
|
+
- hackference.co.uk/assets/img/daniel-knell.jpg
|
124
|
+
- hackference.co.uk/assets/img/etiene-dalcol.jpg
|
125
|
+
- hackference.co.uk/assets/img/felienne-hermans.jpg
|
126
|
+
- hackference.co.uk/assets/img/hack.png
|
127
|
+
- hackference.co.uk/assets/img/hugh-rawlinson.jpg
|
128
|
+
- hackference.co.uk/assets/img/improbable.png
|
129
|
+
- hackference.co.uk/assets/img/jessica-rose-cartoon.png
|
130
|
+
- hackference.co.uk/assets/img/jonathan-kingsley.jpeg
|
131
|
+
- hackference.co.uk/assets/img/logo-small.png
|
132
|
+
- hackference.co.uk/assets/img/martin-splitt.jpg
|
133
|
+
- hackference.co.uk/assets/img/microsoft.png
|
134
|
+
- hackference.co.uk/assets/img/mozilla.png
|
135
|
+
- hackference.co.uk/assets/img/nexmo.png
|
136
|
+
- hackference.co.uk/assets/img/packt.png
|
137
|
+
- hackference.co.uk/assets/img/pebble.png
|
138
|
+
- hackference.co.uk/assets/img/proactive.png
|
139
|
+
- hackference.co.uk/assets/img/pusher.png
|
140
|
+
- hackference.co.uk/assets/img/remy-sharp.jpg
|
141
|
+
- hackference.co.uk/assets/img/sam-wierema.jpg
|
142
|
+
- hackference.co.uk/assets/img/samathy-barratt.jpg
|
143
|
+
- hackference.co.uk/assets/img/soledad.png
|
144
|
+
- hackference.co.uk/assets/img/technical-team-solutions.png
|
145
|
+
- hackference.co.uk/assets/img/terence-eden.png
|
146
|
+
- hackference.co.uk/cdn-cgi/l/email-protection/index.html
|
147
|
+
- hackference.co.uk/cdn-cgi/scripts/cf.common.js
|
148
|
+
- hackference.co.uk/cdn-cgi/scripts/zepto.min.js
|
149
|
+
- hackference.co.uk/cdn-cgi/styles/cf.errors.css
|
150
|
+
- hackference.co.uk/cdnjs.cloudflare.com/ajax/libs/SlickNav/1.0.7/jquery.slicknav.min.js
|
151
|
+
- hackference.co.uk/code-of-conduct.html
|
152
|
+
- hackference.co.uk/code.jquery.com/jquery-1.12.3.min.js
|
153
|
+
- hackference.co.uk/conference.html
|
154
|
+
- hackference.co.uk/contact.html
|
155
|
+
- hackference.co.uk/credits.html
|
156
|
+
- hackference.co.uk/hackathon.html
|
157
|
+
- hackference.co.uk/index.html
|
158
|
+
- hackference.co.uk/js.tito.io/v1/index.html
|
159
|
+
- hackference.co.uk/maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css
|
160
|
+
- hackference.co.uk/maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css
|
161
|
+
- hackference.co.uk/platform.twitter.com/js/tweet.00c2bb81d4f3f312a9a021c715a1c9dc.js
|
162
|
+
- hackference.co.uk/platform.twitter.com/widgets.js
|
163
|
+
- hackference.co.uk/speakers/andrew-faraday.html
|
164
|
+
- hackference.co.uk/speakers/dan-jenkins.html
|
165
|
+
- hackference.co.uk/speakers/daniel-knell.html
|
166
|
+
- hackference.co.uk/speakers/etiene-dalcol.html
|
167
|
+
- hackference.co.uk/speakers/felienne-hermans.html
|
168
|
+
- hackference.co.uk/speakers/hugh-rawlinson.html
|
169
|
+
- hackference.co.uk/speakers/jonathan-kingsley.html
|
170
|
+
- hackference.co.uk/speakers/martin-splitt.html
|
171
|
+
- hackference.co.uk/speakers/remy-sharp.html
|
172
|
+
- hackference.co.uk/speakers/sam-wierema.html
|
173
|
+
- hackference.co.uk/speakers/samathy-barratt.html
|
174
|
+
- hackference.co.uk/speakers/soledad-penades.html
|
175
|
+
- hackference.co.uk/speakers/terence-eden.html
|
176
|
+
- hackference.co.uk/sponsors/algolia.html
|
177
|
+
- hackference.co.uk/sponsors/contentful.html
|
178
|
+
- hackference.co.uk/sponsors/improbable.html
|
179
|
+
- hackference.co.uk/sponsors/index.html
|
180
|
+
- hackference.co.uk/sponsors/microsoft.html
|
181
|
+
- hackference.co.uk/sponsors/nexmo.html
|
182
|
+
- hackference.co.uk/sponsors/packt.html
|
183
|
+
- hackference.co.uk/sponsors/pebble.html
|
184
|
+
- hackference.co.uk/sponsors/pusher.html
|
185
|
+
- hackference.co.uk/sponsors/sponsor-us.html
|
186
|
+
- hackference.co.uk/tickets.html
|
114
187
|
- lib/elsmore.rb
|
115
188
|
- lib/elsmore/command.rb
|
116
189
|
- lib/elsmore/document.rb
|