outrider 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +3 -0
- data/Capfile +30 -0
- data/Gemfile +20 -0
- data/Gemfile.lock +119 -0
- data/LICENSE.txt +21 -0
- data/README.md +261 -0
- data/Rakefile +52 -0
- data/app/run.rb +15 -0
- data/bin/console +14 -0
- data/bin/outrider +8 -0
- data/bin/setup +7 -0
- data/config.ru +2 -0
- data/config/messages.yml +1 -0
- data/config/schema.sql +40 -0
- data/lib/ignite.rb +8 -0
- data/lib/outrider.rb +94 -0
- data/lib/outrider/commandify.rb +49 -0
- data/lib/outrider/engine.rb +20 -0
- data/lib/outrider/intel.rb +14 -0
- data/lib/outrider/project.rb +146 -0
- data/lib/outrider/tools.rb +224 -0
- data/lib/outrider/version.rb +3 -0
- data/outrider.gemspec +27 -0
- data/projects/nz_herald/auxiliary.rb +56 -0
- data/projects/stuff/auxiliary.rb +71 -0
- data/projects/test_project/auxiliary.rb +63 -0
- data/projects/theage/auxiliary.rb +29 -0
- data/public/index.html +0 -0
- data/tmp/x.txt +1 -0
- metadata +122 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 41830dc2f20f2dca04892fa4a93db30a9a12286e
|
|
4
|
+
data.tar.gz: f8daadaa7e1bd18aecc62cc61b1650a18db96012
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 2058dddd555c90772157e5ee5c37f13d3cc93677aba9c2376a8d1526ddba06e6af58f5a031b807235ea9183edee53cb9da8df97e403ae769831e2a2c65d59a65
|
|
7
|
+
data.tar.gz: 08439e7eb5656870da32361b22372979be7a540528f13187a14826611da593ecf863073ad673885ab9bdbc5915b9ad6b3056890c28beafa8e57bae97075e99f0
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
2.2.1
|
data/.travis.yml
ADDED
data/Capfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Load DSL and set up stages
|
|
2
|
+
require 'capistrano/setup'
|
|
3
|
+
|
|
4
|
+
# Include default deployment tasks
|
|
5
|
+
require 'capistrano/deploy'
|
|
6
|
+
|
|
7
|
+
# Include tasks from other gems included in your Gemfile
|
|
8
|
+
#
|
|
9
|
+
# For documentation on these, see for example:
|
|
10
|
+
#
|
|
11
|
+
# https://github.com/capistrano/rvm
|
|
12
|
+
# https://github.com/capistrano/rbenv
|
|
13
|
+
# https://github.com/capistrano/chruby
|
|
14
|
+
# https://github.com/capistrano/bundler
|
|
15
|
+
# https://github.com/capistrano/rails
|
|
16
|
+
# https://github.com/capistrano/passenger
|
|
17
|
+
#
|
|
18
|
+
# require 'capistrano/rvm'
|
|
19
|
+
# require 'capistrano/rbenv'
|
|
20
|
+
# require 'capistrano/chruby'
|
|
21
|
+
# require 'capistrano/bundler'
|
|
22
|
+
# require 'capistrano/rails/assets'
|
|
23
|
+
# require 'capistrano/rails/migrations'
|
|
24
|
+
# require 'capistrano/passenger'
|
|
25
|
+
# require 'capistrano/ssh_doctor'
|
|
26
|
+
require 'rvm1/capistrano3'
|
|
27
|
+
require 'capistrano/bundler'
|
|
28
|
+
|
|
29
|
+
# Load custom tasks from `lib/capistrano/tasks` if you have any defined
|
|
30
|
+
Dir.glob('lib/capistrano/tasks/*.rake').each { |r| import r }
|
data/Gemfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
source 'https://rubygems.org'
|
|
2
|
+
|
|
3
|
+
# Specify your gem's dependencies in outrider.gemspec
|
|
4
|
+
gemspec
|
|
5
|
+
|
|
6
|
+
gem 'trollop', '~> 2.1.2'
|
|
7
|
+
gem 'nokogiri', '~> 1.6.6.2'
|
|
8
|
+
gem 'mechanize', '~> 2.7.3'
|
|
9
|
+
gem 'activesupport', '~> 4.2.1'
|
|
10
|
+
gem 'i18n', '~> 0.7.0'
|
|
11
|
+
gem 'activerecord', '~> 4.2.1'
|
|
12
|
+
gem 'mysql2', '~> 0.3.18'
|
|
13
|
+
gem 'rspec', '~> 3.2.0'
|
|
14
|
+
gem 'capistrano', '~> 3.4.0'
|
|
15
|
+
gem 'rvm1-capistrano3', '~> 1.3.2.2'
|
|
16
|
+
gem 'capistrano-bundler', '~> 1.1.4'
|
|
17
|
+
gem 'rack', '~> 1.6.0'
|
|
18
|
+
gem 'sinatra', '~> 1.4.6'
|
|
19
|
+
gem 'facets', '~> 3.0.0'
|
|
20
|
+
|
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
outrider (0.0.1)
|
|
5
|
+
|
|
6
|
+
GEM
|
|
7
|
+
remote: https://rubygems.org/
|
|
8
|
+
specs:
|
|
9
|
+
activemodel (4.2.1)
|
|
10
|
+
activesupport (= 4.2.1)
|
|
11
|
+
builder (~> 3.1)
|
|
12
|
+
activerecord (4.2.1)
|
|
13
|
+
activemodel (= 4.2.1)
|
|
14
|
+
activesupport (= 4.2.1)
|
|
15
|
+
arel (~> 6.0)
|
|
16
|
+
activesupport (4.2.1)
|
|
17
|
+
i18n (~> 0.7)
|
|
18
|
+
json (~> 1.7, >= 1.7.7)
|
|
19
|
+
minitest (~> 5.1)
|
|
20
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
|
21
|
+
tzinfo (~> 1.1)
|
|
22
|
+
arel (6.0.0)
|
|
23
|
+
builder (3.2.2)
|
|
24
|
+
capistrano (3.4.0)
|
|
25
|
+
i18n
|
|
26
|
+
rake (>= 10.0.0)
|
|
27
|
+
sshkit (~> 1.3)
|
|
28
|
+
capistrano-bundler (1.1.4)
|
|
29
|
+
capistrano (~> 3.1)
|
|
30
|
+
sshkit (~> 1.2)
|
|
31
|
+
colorize (0.7.5)
|
|
32
|
+
diff-lcs (1.2.5)
|
|
33
|
+
domain_name (0.5.24)
|
|
34
|
+
unf (>= 0.0.5, < 1.0.0)
|
|
35
|
+
facets (3.0.0)
|
|
36
|
+
http-cookie (1.0.2)
|
|
37
|
+
domain_name (~> 0.5)
|
|
38
|
+
i18n (0.7.0)
|
|
39
|
+
json (1.8.2)
|
|
40
|
+
mechanize (2.7.3)
|
|
41
|
+
domain_name (~> 0.5, >= 0.5.1)
|
|
42
|
+
http-cookie (~> 1.0)
|
|
43
|
+
mime-types (~> 2.0)
|
|
44
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
|
45
|
+
net-http-persistent (~> 2.5, >= 2.5.2)
|
|
46
|
+
nokogiri (~> 1.4)
|
|
47
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
|
48
|
+
webrobots (>= 0.0.9, < 0.2)
|
|
49
|
+
mime-types (2.4.3)
|
|
50
|
+
mini_portile (0.6.2)
|
|
51
|
+
minitest (5.6.0)
|
|
52
|
+
mysql2 (0.3.18)
|
|
53
|
+
net-http-digest_auth (1.4)
|
|
54
|
+
net-http-persistent (2.9.4)
|
|
55
|
+
net-scp (1.2.1)
|
|
56
|
+
net-ssh (>= 2.6.5)
|
|
57
|
+
net-ssh (2.9.2)
|
|
58
|
+
nokogiri (1.6.6.2)
|
|
59
|
+
mini_portile (~> 0.6.0)
|
|
60
|
+
ntlm-http (0.1.1)
|
|
61
|
+
rack (1.6.0)
|
|
62
|
+
rack-protection (1.5.3)
|
|
63
|
+
rack
|
|
64
|
+
rake (10.4.2)
|
|
65
|
+
rspec (3.2.0)
|
|
66
|
+
rspec-core (~> 3.2.0)
|
|
67
|
+
rspec-expectations (~> 3.2.0)
|
|
68
|
+
rspec-mocks (~> 3.2.0)
|
|
69
|
+
rspec-core (3.2.2)
|
|
70
|
+
rspec-support (~> 3.2.0)
|
|
71
|
+
rspec-expectations (3.2.0)
|
|
72
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
73
|
+
rspec-support (~> 3.2.0)
|
|
74
|
+
rspec-mocks (3.2.1)
|
|
75
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
76
|
+
rspec-support (~> 3.2.0)
|
|
77
|
+
rspec-support (3.2.2)
|
|
78
|
+
rvm1-capistrano3 (1.3.2.2)
|
|
79
|
+
capistrano (~> 3.0)
|
|
80
|
+
sshkit (>= 1.2)
|
|
81
|
+
sinatra (1.4.6)
|
|
82
|
+
rack (~> 1.4)
|
|
83
|
+
rack-protection (~> 1.4)
|
|
84
|
+
tilt (>= 1.3, < 3)
|
|
85
|
+
sshkit (1.7.1)
|
|
86
|
+
colorize (>= 0.7.0)
|
|
87
|
+
net-scp (>= 1.1.2)
|
|
88
|
+
net-ssh (>= 2.8.0)
|
|
89
|
+
thread_safe (0.3.5)
|
|
90
|
+
tilt (2.0.1)
|
|
91
|
+
trollop (2.1.2)
|
|
92
|
+
tzinfo (1.2.2)
|
|
93
|
+
thread_safe (~> 0.1)
|
|
94
|
+
unf (0.1.4)
|
|
95
|
+
unf_ext
|
|
96
|
+
unf_ext (0.0.6)
|
|
97
|
+
webrobots (0.1.1)
|
|
98
|
+
|
|
99
|
+
PLATFORMS
|
|
100
|
+
ruby
|
|
101
|
+
|
|
102
|
+
DEPENDENCIES
|
|
103
|
+
activerecord (~> 4.2.1)
|
|
104
|
+
activesupport (~> 4.2.1)
|
|
105
|
+
bundler (~> 1.9)
|
|
106
|
+
capistrano (~> 3.4.0)
|
|
107
|
+
capistrano-bundler (~> 1.1.4)
|
|
108
|
+
facets (~> 3.0.0)
|
|
109
|
+
i18n (~> 0.7.0)
|
|
110
|
+
mechanize (~> 2.7.3)
|
|
111
|
+
mysql2 (~> 0.3.18)
|
|
112
|
+
nokogiri (~> 1.6.6.2)
|
|
113
|
+
outrider!
|
|
114
|
+
rack (~> 1.6.0)
|
|
115
|
+
rake (~> 10.0)
|
|
116
|
+
rspec (~> 3.2.0)
|
|
117
|
+
rvm1-capistrano3 (~> 1.3.2.2)
|
|
118
|
+
sinatra (~> 1.4.6)
|
|
119
|
+
trollop (~> 2.1.2)
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2015 Jaap Badlands
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# OUTRIDER
|
|
2
|
+
|
|
3
|
+
### Introduction
|
|
4
|
+
|
|
5
|
+
**Outrider Data Framework provides structure and tools for collecting, cleaning, storing and analysing data from around the web**
|
|
6
|
+
|
|
7
|
+
Built using Ruby and Python, Outrider's purpose is to provide an easy-to-use interface and set of tools to help create and run tasks that can programmatically visit, process, scrape, clean, store, analyse, access and display data from online sources.
|
|
8
|
+
|
|
9
|
+
Outrider projects are easily created using a rake command, and your new project file automatically has access to the OutriderTools API and database. For example:
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
# creates a new project file
|
|
13
|
+
rake project:build['nz_herald','http://nzherald.com']
|
|
14
|
+
|
|
15
|
+
# /projects/nz_herald/auxiliary.rb
|
|
16
|
+
# Returns scraped data from each page to be stored in the database
|
|
17
|
+
# using the Outrider storage format.
|
|
18
|
+
def crawl options
|
|
19
|
+
OutriderTools::Crawl::site( @config, ->(page, uri){
|
|
20
|
+
unless( page.css('.articleTitle').text.strip.empty? )
|
|
21
|
+
clean_date = DateTime.strptime(page.css('.storyDate').text.strip, '%a %b %d %H:%M:%S %Z %Y').to_s #Tue Mar 03 08:27:23 UTC 2015
|
|
22
|
+
return {
|
|
23
|
+
:title_raw => page.css('.articleTitle').text.strip,
|
|
24
|
+
:author => page.css('.authorName a').text.strip,
|
|
25
|
+
:content_raw => page.css('#articleBody p').map{ |paragraph| paragraph.text.strip }.to_json,
|
|
26
|
+
:date_published_raw => page.css('.storyDate').text.strip,
|
|
27
|
+
:date_published_timestamp => clean_date,
|
|
28
|
+
:status => 'scraped'
|
|
29
|
+
}
|
|
30
|
+
else
|
|
31
|
+
return {
|
|
32
|
+
:status => 'rejected'
|
|
33
|
+
}
|
|
34
|
+
end
|
|
35
|
+
})
|
|
36
|
+
end
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
### Features
|
|
41
|
+
|
|
42
|
+
| Feature | Purpose |
|
|
43
|
+
| ------- | ------- |
|
|
44
|
+
Data Mining | Outrider provides tools for **collecting**, **cleaning** and **storing data** from the web.
|
|
45
|
+
Statistical Analysis | Outrider provides libraries for running **statistical algorithms** over datasets.
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
## How it works
|
|
50
|
+
#### Command Line Interface
|
|
51
|
+
At its most basic level, Outrider provides a command line interface whose commands give us the ability to call and pass arguments to our API. The command line is used by running `./lib/ignite.rb`. When you call this file through your shell, you must pass it a command to run and any arguments to pass through to the command, such as:
|
|
52
|
+
|
|
53
|
+
```shell
|
|
54
|
+
./lib/ignite.rb crawl -p project_name
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
#### API
|
|
58
|
+
The commands that can be recognised and run through the CLI form the *Outrider API*. The API sits behind and is accessed through the command line. Actions that are common to Outrider's main goals - such as crawling and scraping - are made available publicly by the API and can be called by passing the action as the first argument to `./lib/ignite.rb` when using the CLI. In the above example, the word *crawl* is a method of the Outrider API and `-p project_name` tells the API which project to look in for the *crawl* implementation.
|
|
59
|
+
|
|
60
|
+
##### API Extension
|
|
61
|
+
The API can be extended by creating the functionality in your project's *auxiliary.rb* and modifying commandify.rb.
|
|
62
|
+
|
|
63
|
+
For example:
|
|
64
|
+
|
|
65
|
+
```ruby
|
|
66
|
+
# in ./lib/project/:project_name/auxiliary.rb
|
|
67
|
+
def my_own_command( options )
|
|
68
|
+
# Here you have access to the OutriderTools module
|
|
69
|
+
# Also, options is a hash of the commands passed in through CLI
|
|
70
|
+
end
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Commands are handled by a gem called Trollop
|
|
74
|
+
|
|
75
|
+
*IMPORTANT!*
|
|
76
|
+
* Do not modify the existing Trollop configuration
|
|
77
|
+
* Read the Trollop documentation at http://manageiq.github.io/trollop/
|
|
78
|
+
* Put your own configuration in the specified places - see below.
|
|
79
|
+
* Always run tests `rspec spec` after modifying this
|
|
80
|
+
* TODO - move this functionality into a setup where they don't have to touch commandify.rb
|
|
81
|
+
|
|
82
|
+
```ruby
|
|
83
|
+
# in ./lib/outrider/commandify.rb
|
|
84
|
+
module Commandify
|
|
85
|
+
def self.process
|
|
86
|
+
# Place custom command options here. See instructions at http://manageiq.github.io/trollop/
|
|
87
|
+
sub_commands << %w()
|
|
88
|
+
# Set these to accept arguments through the command line and pass them to your auxiliary methods
|
|
89
|
+
command_opts = Trollop::options do
|
|
90
|
+
# REQUIRED. Do not mess with the default options.
|
|
91
|
+
# Do not duplicate arguments or their short form.
|
|
92
|
+
# Run tests after modifying
|
|
93
|
+
# opt :domain, "The domain", :short => "-d", :type => String, :default => ''
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# CUSTOM. Place custom command options here
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
##### In your command line
|
|
106
|
+
```shell
|
|
107
|
+
./lib/ignite.rb my_own_command -p project_name -your_argument_key value
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
###### Options
|
|
111
|
+
Once set up in commandify.rb, calling your new method in auxiliary.rb will pass in a hash of the options specified in the command line call. This means your auxiliary methods need to accept the options hash.
|
|
112
|
+
```ruby
|
|
113
|
+
def auxiliary_method( options )
|
|
114
|
+
# options contains a hash such as { :project => 'project_name' }
|
|
115
|
+
end
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
#### Projects
|
|
119
|
+
When working with Outrider, you first create and then work within a *project*. You can create as many of these as you want. These let us create custom functionality to handle different jobs uniquely.
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
#### Creating Projects
|
|
123
|
+
##### CLI
|
|
124
|
+
Projects can be created using the command line
|
|
125
|
+
``` shell
|
|
126
|
+
# Creates ./lib/projects/:project_name/auxiliary.rb
|
|
127
|
+
#and adds a record to the projects table in the database,
|
|
128
|
+
# including a seed entry to the raw_data table
|
|
129
|
+
./lib/ignite.rb create_project -p project_name
|
|
130
|
+
|
|
131
|
+
# Just adds a record to the projects table of the database
|
|
132
|
+
# and the seed entry in the raw_data table (doesn't create an auxiliary file)
|
|
133
|
+
./lib/ignite.rb create_project_db_row -p project_name -d http://domain.com
|
|
134
|
+
|
|
135
|
+
# Deletes the project folder and row in the project table of the database
|
|
136
|
+
./lib/ignite.rb delete_project -p project_name
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
##### RAKE
|
|
140
|
+
Outrider is assumed to run on two machines - a dev and a production server. When you create a new project you do so on the dev server and it gets added to git, however in order to make a project runnable in production, there must be some entries in the database for that. Outrider has a rake command that is run from the dev server which creates the files and db entry on the dev server and also creates the necessary database entries on the production server.
|
|
141
|
+
|
|
142
|
+
```shell
|
|
143
|
+
rake project:build['project_name','project_domain']
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
#### Customizing Projects
|
|
147
|
+
Once created, a project consists of a file `./lib/projects/:project_name/auxiliary.rb` which contains a class whose public methods correspond to the CLI commands.
|
|
148
|
+
|
|
149
|
+
```ruby
|
|
150
|
+
# lib/projects/test_project/auxiliary.rb
|
|
151
|
+
|
|
152
|
+
class TestProject < Project
|
|
153
|
+
|
|
154
|
+
def initialize
|
|
155
|
+
project_name :test_project
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def crawl
|
|
159
|
+
# See OutriderTools documentation http://github.com/deadlysyntax/outrider
|
|
160
|
+
# You have full access to the OutriderTools module
|
|
161
|
+
# You inherit all the methods and instance variables defined in the global Project class
|
|
162
|
+
# Which also gives you access to @config which is a hash containing :id, :title and :domain of the project
|
|
163
|
+
|
|
164
|
+
OutriderTools::Crawl::site( @config, ->(page, uri){
|
|
165
|
+
# Use the Nokogiri page object here to do what you want to each page
|
|
166
|
+
})
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
## The process
|
|
174
|
+
A call to `./ignite.rb crawl -p test_project` will
|
|
175
|
+
1. Check the validity of the command against the API definition in **./lib/outrider/commandify.rb**,
|
|
176
|
+
2. In this case **crawl** is a legitimate API method, and since that passes it will then
|
|
177
|
+
3. Look in `./lib/project/test_project/auxiliary.rb` for a public method called **crawl**.
|
|
178
|
+
4. If it doesn't find it there, it will look in the global `Project` object
|
|
179
|
+
5. It will call **crawl** and pass in all the options specified in the command line (as long as they're set up in commandify)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# The OutriderTools module
|
|
184
|
+
Outrider Tools is a module that provides an API for the core functionality at the heart of the framework. It is loaded globally, so that you can call these functions from your projects' `auxiliary.rb` files.
|
|
185
|
+
|
|
186
|
+
### OutriderTools API
|
|
187
|
+
#### Crawl
|
|
188
|
+
##### site
|
|
189
|
+
```ruby
|
|
190
|
+
OutriderTools::Crawl::site( project, each_page_callback )
|
|
191
|
+
```
|
|
192
|
+
| Argument | Expected Value | Description |
|
|
193
|
+
| -------- | -------------- | ----------- |
|
|
194
|
+
**project** | { :id => 0, :title => '', :domain => '' } | A hash of project config values
|
|
195
|
+
**each_page_callback** | ->( page, uri ){} | A callback function to run, which gets passed the Nokogiri object and URI for each page
|
|
196
|
+
|
|
197
|
+
Recursively looks to the ProjectData in the database for the first `status: 'unscraped'` data record for the specified project. While at each page, it will run the callback and pass it the Nokogiri::HTML object and the current URI. It builds a list of sanitized URLs and adds them as ProjectData rows in the database, thus recursively filtering through an entire domain and acting on each page.
|
|
198
|
+
|
|
199
|
+
http://www.rubydoc.info/github/sparklemotion/nokogiri
|
|
200
|
+
|
|
201
|
+
________________________________
|
|
202
|
+
|
|
203
|
+
#### Scrape
|
|
204
|
+
```ruby
|
|
205
|
+
OutriderTools::Scrape::page( url, operate )
|
|
206
|
+
```
|
|
207
|
+
| Argument | Expected Value | Description |
|
|
208
|
+
| -------- | -------------- | ----------- |
|
|
209
|
+
**url** | "http://domain.com" | A url to scrape
|
|
210
|
+
**operate** | ->( page, uri ){} | A callback function to run, which gets passed the Nokogiri object and URI for each page
|
|
211
|
+
|
|
212
|
+
Will go to the URL and run the callback and pass it the Nokogiri::HTML object and the current URI.
|
|
213
|
+
|
|
214
|
+
http://www.rubydoc.info/github/sparklemotion/nokogiri
|
|
215
|
+
|
|
216
|
+
________________________________
|
|
217
|
+
|
|
218
|
+
# Installation
|
|
219
|
+
#### Git clone
|
|
220
|
+
|
|
221
|
+
Move to the directory you'd like to put the Outrider app.
|
|
222
|
+
|
|
223
|
+
> git clone git@github.com:deadlysyntax/outrider.git
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
#### Configuration
|
|
227
|
+
The following configuration files must be created (assuming ./ is the app root):
|
|
228
|
+
```ruby
|
|
229
|
+
# - config
|
|
230
|
+
# - - \ - database.yml
|
|
231
|
+
# - - - - hosts.yml
|
|
232
|
+
# - - \ - deploy.rb
|
|
233
|
+
# - - \ - deploy \
|
|
234
|
+
# - - \ - deploy \ - production.rb
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
#### Database
|
|
239
|
+
*TODO add migrations*
|
|
240
|
+
The MySQL-based schema for the Outrider database is found at https://github.com/deadlysyntax/outrider/blob/master/config/schema.sql - this is only a guide, and will in future be handled by Active Record.
|
|
241
|
+
|
|
242
|
+
Set up a database, import the schema (if using MySQL) and create the following file: **./config/database.yml**. This file is expected by the system, which will not run unless these steps have been completed properly.
|
|
243
|
+
|
|
244
|
+
```yaml
|
|
245
|
+
host: localhost
|
|
246
|
+
username: root
|
|
247
|
+
password: root
|
|
248
|
+
database: outrider
|
|
249
|
+
adapter: mysql2
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
#### System Information
|
|
254
|
+
Requires Ruby 2.2.1. Outrider is run on two machines - the development machine and the remote server - and is deployed using Capistrano.
|
|
255
|
+
|
|
256
|
+
#### Tests
|
|
257
|
+
```shell
|
|
258
|
+
# in project root ./
|
|
259
|
+
rspec spec
|
|
260
|
+
```
|
|
261
|
+
|