outrider 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +3 -0
- data/Capfile +30 -0
- data/Gemfile +20 -0
- data/Gemfile.lock +119 -0
- data/LICENSE.txt +21 -0
- data/README.md +261 -0
- data/Rakefile +52 -0
- data/app/run.rb +15 -0
- data/bin/console +14 -0
- data/bin/outrider +8 -0
- data/bin/setup +7 -0
- data/config.ru +2 -0
- data/config/messages.yml +1 -0
- data/config/schema.sql +40 -0
- data/lib/ignite.rb +8 -0
- data/lib/outrider.rb +94 -0
- data/lib/outrider/commandify.rb +49 -0
- data/lib/outrider/engine.rb +20 -0
- data/lib/outrider/intel.rb +14 -0
- data/lib/outrider/project.rb +146 -0
- data/lib/outrider/tools.rb +224 -0
- data/lib/outrider/version.rb +3 -0
- data/outrider.gemspec +27 -0
- data/projects/nz_herald/auxiliary.rb +56 -0
- data/projects/stuff/auxiliary.rb +71 -0
- data/projects/test_project/auxiliary.rb +63 -0
- data/projects/theage/auxiliary.rb +29 -0
- data/public/index.html +0 -0
- data/tmp/x.txt +1 -0
- metadata +122 -0

data/lib/outrider/tools.rb
ADDED
@@ -0,0 +1,224 @@
require 'nokogiri'
require 'active_record'
require 'active_support/core_ext/module/attribute_accessors'
require 'active_support/inflector'
require 'open-uri'
require 'uri'
require 'set'
require 'json'
require 'yaml'
require 'logger'
require 'time'
require 'trollop'
require_relative 'intel'


module OutriderTools


  module Crawl

    def self.site project, operate

      @log = Logger.new(STDOUT)

      recurse = ->() do
        #
        # Pick a page from the database to crawl
        unless ProjectData.where( status: 'unscraped', project_id: project[:id] ).exists?
          @log.info "No pages to scrape"
          return false
        end

        working_page = ProjectData.where( status: 'unscraped', project_id: project[:id]).first
        working_page.status = 'processing'
        working_page.save

        @log.info "Scraping #{working_page.url}"
        # Scrape it
        data, links = OutriderTools::Scrape::page( working_page.url, operate)

        # save links
        OutriderTools::Link::save_many(links, project, @log )

        @log.info "Saving page data for url #{working_page.url}"
        @log.info data[:status]
        working_page.update( data ) unless data.nil?

        return true
      end

      crawl = true
      while crawl
        crawl = recurse.call
      end

    end

  end


  module Scrape

    def self.page( url, operate )
      @log = Logger.new('log/logfile.log', 'daily')
      files = OutriderTools::Clean::file_types
      begin
        page_uri = URI.parse( url )
        doc = Nokogiri.HTML( open(page_uri) )
        # Yield page and URI to the block passed in
        data = operate.( doc, page_uri )

        # Find all the links on the page
        hrefs = doc.css('a[href]').map{ |a| a['href'] }

        clean_uris = OutriderTools::Clean::tidy_urls( hrefs, page_uri, page_uri, files )
        return data, clean_uris

      rescue OpenURI::HTTPError # Guard against 404s
        @log.error "Skipping invalid link #{page_uri}"
      rescue ArgumentError => e
        @log.error "Skipping page that causes argument error: #{e}"
      rescue RuntimeError => e
        @log.error "Invalid Redirection: #{e}"
      rescue Exception => e
        @log.error "Error #{e}"
        raise e
      end

      return { :status => 'rejected' }

    end

  end


  module Link

    def self.save_many( links, project, log )

      unless links.nil?
        links.each do |link|
          # Check if link already exists
          #if ProjectData.find_by(url: link.to_s).nil?
          unless ProjectData.where( url: link.to_s, project_id: project[:id] ).exists?
            ProjectData.create({
              :url => link.to_s,
              :status => 'unscraped',
              :project_id => project[:id]
            })
            log.info "Adding new url to database: #{link.to_s}"
          else
            log.info "URL already exists in database: #{link.to_s}"
          end
        end
      end

    end

  end


  module Clean

    def self.tidy_urls hrefs, page_uri, domain, files
      # Make these URIs, throwing out problem ones like mailto:
      uris = hrefs.map{ |href| URI.join( page_uri, href ) rescue nil }.compact

      # Pare it down to only those pages that are on the same site
      uris.select!{ |uri| uri.host == domain.host }

      # Throw out links to files (this could be more efficient with regex)
      uris.reject!{ |uri| files.any?{ |ext| uri.path.end_with?(".#{ext}") } }

      # Throw out duplicates
      uris.reject!{ |uri| ProjectData.exists?( url: uri.to_s) }

      # Remove #foo fragments so that sub-page links aren't differentiated
      uris.each{ |uri| uri.fragment = nil }

      return uris

    end


    def self.file_types sub = :all
      case sub
      when :all
        return %w[png jpeg jpg gif svg txt js css zip gz pdf]
      when :images
        return %w[png jpeg jpg gif svg]
      when :pdfs
        return %w[pdf]
      else
        return %w[png jpeg jpg gif svg txt js css zip gz pdf]
      end
    end


    # takes a string of words, sorts out duds and returns an array
    def self.process_words_to_array words = ""
      clean_words = words.split.each do |word|
        word.downcase!
      end
    end


    # takes an array of strings and combines them
    def self.word_array_to_string strings
      the_string = ''
      strings.each do |string|
        the_string += string.gsub(/[^a-z0-9\s]/i, '')
      end
      return the_string
    end

  end


  module Store

    def self.get_filepath base, filename
      File.expand_path(File.join(File.dirname(base), filename ))
    end

  end


end
data/outrider.gemspec
ADDED
@@ -0,0 +1,27 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

Gem::Specification.new do |spec|
  spec.name          = "outrider"
  spec.version       = "0.0.1"
  spec.authors       = ["Jaap Badlands"]
  spec.email         = ["jaap@deadlysyntax.com"]

  spec.summary       = %q{Outrider Web Automation Framework provides structure and tools for writing web-automation tasks}
  spec.description   = %q{Outrider's purpose is to provide an easy-to-use programming interface and organisational structure for creating and running tasks that can automatically visit, interact with and test websites, as well as tools to process, clean and store data and to carry out statistical analysis.}
  spec.homepage      = "https://github.com/deadlysyntax/outrider"

  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "bin"
  spec.executables   = ['outrider']
  spec.require_paths = ["lib"]

  if spec.respond_to?(:metadata)
    #spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com' to prevent pushes to rubygems.org, or delete to allow pushes to any server."
  end

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "rake", "~> 10.0"
end

data/projects/nz_herald/auxiliary.rb
ADDED
@@ -0,0 +1,56 @@
class NzHerald < Project


  def initialize
    project_name :nz_herald
    @log = Logger.new(STDOUT)
  end


  def crawl options
    OutriderTools::Crawl::site( @config, ->(page, uri){
      unless( page.css('.articleTitle').text.strip.empty? )
        clean_date = DateTime.strptime(page.css('.storyDate').text.strip, '%a %b %d %H:%M:%S %Z %Y').to_s # Tue Mar 03 08:27:23 UTC 2015
        return {
          :title_raw => page.css('.articleTitle').text.strip,
          :author => page.css('.authorName a').text.strip,
          :content_raw => page.css('#articleBody p').map{ |paragraph| paragraph.text.strip }.to_json,
          :date_published_raw => page.css('.storyDate').text.strip,
          :date_published_timestamp => clean_date,
          :status => 'scraped'
        }
      else
        return {
          :status => 'rejected'
        }
      end
    })
  end


  def intel options

    raw_articles = ProjectData.where( "project_id = ? AND content_raw IS NOT NULL", @config[:id] ).limit(10000)

    paragraphs = ''
    words = []

    raw_articles.each do |article|
      paragraphs += OutriderTools::Clean::word_array_to_string( JSON.parse( article.content_raw ) ) + ' '
    end

    words = OutriderIntel::word_frequency( OutriderTools::Clean::process_words_to_array( paragraphs ) )
    words = words.sort_by { |word, frequency| frequency }

    p words

  end


end
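
The intel methods in NzHerald above and Stuff below hand the cleaned word list to OutriderIntel::word_frequency and then sort by frequency. That method lives in data/lib/outrider/intel.rb, which is not reproduced in this section; presumably it returns word/count pairs. A minimal sketch of that assumed behaviour, not the gem's actual implementation:

# Assumed behaviour of OutriderIntel::word_frequency; the real code may differ.
def word_frequency(words)
  words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
end

words = %w[crawl scrape crawl crawl scrape]
p word_frequency(words).sort_by { |word, frequency| frequency }
# => [["scrape", 2], ["crawl", 3]]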

data/projects/stuff/auxiliary.rb
ADDED
@@ -0,0 +1,71 @@
class Stuff < Project


  def initialize

    project_name :stuff

  end


  def crawl options
    OutriderTools::Crawl::site( @config, ->(page, uri){
      unless( page.css('.story_landing').text.strip.empty? )
        #p page.css('.story_landing .story__dateline span').empty?
        unless page.css('.story_landing .story__dateline span').empty?
          clean_date = DateTime.strptime( page.css('.story_landing .story__dateline span')[0]["content"], '%a %b %d %H:%M:%S %Z %Y').to_s
        else
          return { :status => 'rejected' }
        end

        return {
          :url => uri.to_s,
          :title_raw => page.css('.story_content_top h1').text.strip,
          :author => page.css('.story__byline span[itemprop="name"]').text.strip,
          :content_raw => page.css('.story_landing > p').map{ |paragraph| paragraph.text.strip }.to_json,
          :date_published_raw => page.css('.story_landing .story__dateline span')[0]["content"],
          :date_published_timestamp => clean_date,
          :status => 'scraped'
        }
      else
        return { :status => 'rejected' }
      end
    })
  end


  def intel options

    raw_articles = ProjectData.where( "project_id = ? AND content_raw IS NOT NULL", @config[:id] ).limit(10000)

    paragraphs = ''
    words = []

    raw_articles.each do |article|
      paragraphs += OutriderTools::Clean::word_array_to_string( JSON.parse( article.content_raw ) ) + ' '
    end

    words = OutriderIntel::word_frequency( OutriderTools::Clean::process_words_to_array( paragraphs ) )
    words = words.sort_by { |word, frequency| frequency }

    p words

  end


end

data/projects/test_project/auxiliary.rb
ADDED
@@ -0,0 +1,63 @@
class TestProject < Project


  def initialize

    project_name :test_project

  end


  def test_method_true options
    return options
  end

  def test_method_false options
    return false
  end


  #
  # options are passed through from the command line
  #
  def crawl options

    OutriderTools::Crawl::site( @config, ->(page, uri){
      unless( page.css('.story_landing').text.strip.empty? )
        #p page.css('.story_landing .story__dateline span').empty?
        unless page.css('.story_landing .story__dateline span').empty?
          clean_date = DateTime.strptime( page.css('.story_landing .story__dateline span')[0]["content"], '%a %b %d %H:%M:%S %Z %Y').to_s
        else
          return { :status => 'rejected' }
        end

        return {
          :url => uri.to_s,
          :title_raw => page.css('.story_content_top h1').text.strip,
          :author => page.css('.story__byline span[itemprop="name"]').text.strip,
          :content_raw => page.css('.story_landing > p').map{ |paragraph| paragraph.text.strip }.to_json,
          :date_published_raw => page.css('.story_landing .story__dateline span')[0]["content"],
          :date_published_timestamp => clean_date,
          :status => 'scraped'
        }
      else
        return { :status => 'rejected' }
      end
    })

  end


end
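
TestProject above exposes test_method_true and test_method_false so the command layer can be exercised without hitting a live site. A hedged sketch of poking at it from an interactive session, assuming the gem and its database are already set up; the exact bootstrapping lives in lib/outrider.rb and bin/console, which are not reproduced in this section.

# Hypothetical session; wiring depends on lib/outrider.rb, which this diff section omits.
require 'outrider'

project = TestProject.new
p project.test_method_true(verbose: true)   # => {:verbose=>true}
p project.test_method_false({})             # => false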