outrider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,224 @@
1
+ require 'nokogiri'
2
+ require 'active_record'
3
+ require 'active_support/core_ext/module/attribute_accessors'
4
+ require 'active_support/inflector'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'set'
8
+ require 'json'
9
+ require 'yaml'
10
+ require 'logger'
11
+ require 'time'
12
+ require 'trollop'
13
+ require_relative 'intel'
14
+
15
+
16
+ module OutriderTools
17
+
18
+
19
module Crawl

  # Crawl every page recorded as 'unscraped' for +project+.
  #
  # project - hash-like object; project[:id] scopes ProjectData rows.
  # operate - callable forwarded to Scrape.page; receives the parsed
  #           document and page URI and returns the attribute hash to
  #           persist (or nil to persist nothing).
  #
  # Loops until no 'unscraped' ProjectData row remains for the project.
  def self.site project, operate

    @log = Logger.new(STDOUT)

    process_next = ->() do
      # Pick one unscraped page for this project to crawl; a single
      # query replaces the original exists?-then-first double lookup.
      working_page = ProjectData.where( status: 'unscraped', project_id: project[:id] ).first
      if working_page.nil?
        @log.info "No pages to scrape"
        return false
      end

      # Mark the row as in-flight so it is not picked up again.
      working_page.status = 'processing'
      working_page.save

      @log.info "Scraping #{working_page.url}"
      # Scrape the page; data is the operate result, links the outbound URIs.
      data, links = OutriderTools::Scrape::page( working_page.url, operate )

      # Queue newly discovered links for later scraping.
      OutriderTools::Link::save_many( links, project, @log )

      @log.info "Saving page data for url #{working_page.url}"
      # Guard both uses of data: operate may legitimately return nil,
      # and the original crashed on data[:status] in that case.
      unless data.nil?
        @log.info data[:status]
        working_page.update( data )
      end

      true
    end

    loop do
      break unless process_next.call
    end

  end

end
68
+
69
module Scrape

  # Fetch +url+, hand the parsed document to +operate+, and collect the
  # page's cleaned outbound links.
  #
  # Returns [data, links]: data is whatever +operate+ returned and links
  # an array of tidied URIs. On a rescued fetch/parse failure the error
  # is logged and [{ :status => 'rejected' }, nil] is returned, so callers
  # that destructure the result always receive a data hash (the original
  # returned a bare hash on this path and relied on implicit nil-padding).
  def self.page( url, operate )
    @log = Logger.new('log/logfile.log', 'daily')
    files = OutriderTools::Clean::file_types
    begin
      page_uri = URI.parse( url )
      doc = Nokogiri.HTML( open(page_uri) )
      # Yield page and URI to the callable passed in
      data = operate.( doc, page_uri )

      # Find all the links on the page
      hrefs = doc.css('a[href]').map{ |a| a['href'] }

      # page_uri doubles as the "same site" domain filter.
      clean_uris = OutriderTools::Clean::tidy_urls( hrefs, page_uri, page_uri, files )
      return data, clean_uris

    rescue OpenURI::HTTPError # Guard against 404s
      @log.error "Skipping invalid link #{page_uri}"
    rescue ArgumentError => e
      @log.error "Skipping page that causes argument error: #{e}"
    rescue RuntimeError => e
      @log.error "Invalid Redirection: #{e}"
    rescue Exception => e
      # Unexpected errors are logged but never swallowed; bare raise
      # re-raises the current exception.
      @log.error "Error #{e}"
      raise
    end

    # Reached only after a rescued (non-fatal) error above.
    return { :status => 'rejected' }, nil

  end

end
106
+
107
+
108
+
109
module Link

  # Record each discovered link as an 'unscraped' ProjectData row for
  # +project+, skipping URLs already stored for that project.
  #
  # links   - enumerable of URI-ish objects, or nil (treated as empty).
  # project - hash-like; project[:id] scopes the lookup and the insert.
  # log     - logger receiving one info line per link.
  def self.save_many( links, project, log )

    Array( links ).each do |link|
      url = link.to_s
      if ProjectData.where( url: url, project_id: project[:id] ).exists?
        log.info "URL already exists in database: #{url}"
      else
        ProjectData.create({
          :url        => url,
          :status     => 'unscraped',
          :project_id => project[:id]
        })
        log.info "Adding new url to database: #{url}"
      end
    end

  end

end
134
+
135
+
136
+
137
+
138
module Clean

  # Normalise raw href strings from a page into crawlable URIs:
  # resolve against +page_uri+, keep only +domain+'s host, drop links
  # to static files (+files+ extensions), strip fragments, and drop
  # URLs already recorded in ProjectData.
  def self.tidy_urls hrefs, page_uri, domain, files
    # Make these URIs, throwing out problem ones like mailto:
    uris = hrefs.map{ |href| URI.join( page_uri, href ) rescue nil }.compact

    # Pare it down to only those pages that are on the same site
    uris.select!{ |uri| uri.host == domain.host }

    # Throw out links to files (this could be more efficient with regex)
    uris.reject!{ |uri| files.any?{ |ext| uri.path.end_with?(".#{ext}") } }

    # Remove #foo fragments BEFORE deduplication so sub-page links
    # collapse onto their page. The original stripped fragments after
    # the database check, letting page#a and page through as distinct,
    # and never de-duplicated within the batch itself.
    uris.each{ |uri| uri.fragment = nil }
    uris.uniq!

    # Throw out URLs we have already recorded
    uris.reject!{ |uri| ProjectData.exists?( url: uri.to_s ) }

    uris
  end

  # File extensions to skip while crawling. +sub+ selects a subset
  # (:images, :pdfs); :all or any unknown value yields the full list.
  # The original duplicated the full list in both :all and else.
  def self.file_types sub = :all
    case sub
    when :images then %w[png jpeg jpg gif svg]
    when :pdfs   then %w[pdf]
    else              %w[png jpeg jpg gif svg txt js css zip gz pdf]
    end
  end

  # Split a string of words into an array of lowercased words.
  # (map replaces the original each + destructive downcase!.)
  def self.process_words_to_array words = ""
    words.split.map(&:downcase)
  end

  # Strip non-alphanumeric characters (whitespace kept) from each
  # string and combine them, joined with a single space: the original
  # concatenated directly, fusing the last word of one string onto the
  # first word of the next and corrupting word-frequency counts.
  def self.word_array_to_string strings
    strings.map{ |string| string.gsub(/[^a-z0-9\s]/i, '') }.join(' ')
  end

end
202
+
203
+
204
+
205
+
206
module Store

  # Resolve +filename+ relative to the directory containing +base+,
  # returning an absolute path.
  def self.get_filepath base, filename
    directory = File.dirname(base)
    File.expand_path(File.join(directory, filename))
  end

end
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+ end
221
+
222
+
223
+
224
+
@@ -0,0 +1,3 @@
1
module Outrider
  # Gem version; keep in sync with outrider.gemspec.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/outrider.gemspec ADDED
@@ -0,0 +1,27 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
# Single source of truth for the gem version; the original duplicated
# the "0.0.1" string here and it could drift from Outrider::VERSION.
# NOTE(review): assumes the version file lives at lib/outrider/version.rb — confirm.
require 'outrider/version'

Gem::Specification.new do |spec|
  spec.name          = "outrider"
  spec.version       = Outrider::VERSION
  spec.authors       = ["Jaap Badlands"]
  spec.email         = ["jaap@deadlysyntax.com"]

  spec.summary       = %q{Outrider Web Automation Framework provides structure and tools for writing web-automation tasks}
  spec.description   = %q{Outrider's purpose is to provide an easy-to-use programming interface and organisational structure, to create and run tasks that can automatically visit, interact with and test websites and also that process, clean and store data, and tools for statistical analysis. }
  spec.homepage      = "https://github.com/deadlysyntax/outrider"

  # Package everything tracked by git except test/spec/feature files.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "bin"
  spec.executables   = ['outrider']
  spec.require_paths = ["lib"]

  if spec.respond_to?(:metadata)
    #spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com' to prevent pushes to rubygems.org, or delete to allow pushes to any server."
  end

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "rake", "~> 10.0"
end
@@ -0,0 +1,56 @@
1
class NzHerald < Project

  def initialize
    project_name :nz_herald
    @log = Logger.new(STDOUT)
  end

  # Crawl the project's queued pages, extracting article fields from
  # pages that carry an .articleTitle element; everything else is
  # marked 'rejected'. options comes from the command line (unused here).
  def crawl options
    OutriderTools::Crawl::site( @config, ->(page, uri){
      # Hoisted: title and date were each queried twice originally.
      title = page.css('.articleTitle').text.strip
      return { :status => 'rejected' } if title.empty?

      raw_date = page.css('.storyDate').text.strip
      # Example raw date: Tue Mar 03 08:27:23 UTC 2015
      clean_date = DateTime.strptime( raw_date, '%a %b %d %H:%M:%S %Z %Y' ).to_s

      {
        # :url added for consistency with the Stuff/TestProject scrapers.
        :url                      => uri.to_s,
        :title_raw                => title,
        :author                   => page.css('.authorName a').text.strip,
        :content_raw              => page.css('#articleBody p').map{ |paragraph| paragraph.text.strip }.to_json,
        :date_published_raw       => raw_date,
        :date_published_timestamp => clean_date,
        :status                   => 'scraped'
      }
    })
  end

  # Build a word-frequency table over all scraped article bodies for
  # this project (capped at 10000 rows) and print it sorted ascending.
  def intel options

    raw_articles = ProjectData.where( "project_id = ? AND content_raw IS NOT NULL", @config[:id] ).limit(10000)

    paragraphs = ''
    raw_articles.each do |article|
      paragraphs += OutriderTools::Clean::word_array_to_string( JSON.parse( article.content_raw ) ) + ' '
    end

    words = OutriderIntel::word_frequency( OutriderTools::Clean::process_words_to_array( paragraphs ) )
    words = words.sort_by { |_word, frequency| frequency }

    p words

  end

end
55
+
56
+
@@ -0,0 +1,71 @@
1
class Stuff < Project

  def initialize
    project_name :stuff
  end

  # Crawl the project's queued pages, extracting article fields from
  # stuff.co.nz story pages; pages without a story landing or a
  # dateline are marked 'rejected'. options comes from the command
  # line (unused here).
  def crawl options
    OutriderTools::Crawl::site( @config, ->(page, uri){
      return { :status => 'rejected' } if page.css('.story_landing').text.strip.empty?

      # Hoisted: the dateline nodes were queried three times originally.
      dateline = page.css('.story_landing .story__dateline span')
      return { :status => 'rejected' } if dateline.empty?

      raw_date = dateline[0]["content"]
      clean_date = DateTime.strptime( raw_date, '%a %b %d %H:%M:%S %Z %Y' ).to_s

      {
        :url                      => uri.to_s,
        :title_raw                => page.css('.story_content_top h1').text.strip,
        :author                   => page.css('.story__byline span[itemprop="name"]').text.strip,
        :content_raw              => page.css('.story_landing > p').map{ |paragraph| paragraph.text.strip }.to_json,
        :date_published_raw       => raw_date,
        :date_published_timestamp => clean_date,
        :status                   => 'scraped'
      }
    })
  end

  # Build a word-frequency table over all scraped article bodies for
  # this project (capped at 10000 rows) and print it sorted ascending.
  def intel options

    raw_articles = ProjectData.where( "project_id = ? AND content_raw IS NOT NULL", @config[:id] ).limit(10000)

    paragraphs = ''
    raw_articles.each do |article|
      paragraphs += OutriderTools::Clean::word_array_to_string( JSON.parse( article.content_raw ) ) + ' '
    end

    words = OutriderIntel::word_frequency( OutriderTools::Clean::process_words_to_array( paragraphs ) )
    words = words.sort_by { |_word, frequency| frequency }

    p words

  end

end
@@ -0,0 +1,63 @@
1
class TestProject < Project

  # Minimal project used to exercise the framework plumbing.
  def initialize
    project_name :test_project
  end

  # Echoes +options+ back unchanged; used to verify option passing.
  def test_method_true options
    options
  end

  # Always false; used to verify failure handling.
  def test_method_false options
    false
  end

  #
  # options are passed through from the command line
  #
  def crawl options
    OutriderTools::Crawl::site( @config, ->(page, uri){
      # Guard clauses: reject pages without a story landing or a dateline.
      landing = page.css('.story_landing')
      return { :status => 'rejected' } if landing.text.strip.empty?

      dateline = page.css('.story_landing .story__dateline span')
      return { :status => 'rejected' } if dateline.empty?

      published = dateline[0]["content"]
      {
        :url                      => uri.to_s,
        :title_raw                => page.css('.story_content_top h1').text.strip,
        :author                   => page.css('.story__byline span[itemprop="name"]').text.strip,
        :content_raw              => page.css('.story_landing > p').map{ |paragraph| paragraph.text.strip }.to_json,
        :date_published_raw       => published,
        :date_published_timestamp => DateTime.strptime( published, '%a %b %d %H:%M:%S %Z %Y' ).to_s,
        :status                   => 'scraped'
      }
    })
  end

end