outrider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 41830dc2f20f2dca04892fa4a93db30a9a12286e
4
+ data.tar.gz: f8daadaa7e1bd18aecc62cc61b1650a18db96012
5
+ SHA512:
6
+ metadata.gz: 2058dddd555c90772157e5ee5c37f13d3cc93677aba9c2376a8d1526ddba06e6af58f5a031b807235ea9183edee53cb9da8df97e403ae769831e2a2c65d59a65
7
+ data.tar.gz: 08439e7eb5656870da32361b22372979be7a540528f13187a14826611da593ecf863073ad673885ab9bdbc5915b9ad6b3056890c28beafa8e57bae97075e99f0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ #/Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /config/*
10
+ !/config/schema.sql
11
+ !/config/messages.yml
12
+ logfile.log
13
+ #/lib/projects/*
14
+ #!/lib/projects/
15
+ *.log
16
+ .DS_Store
17
+ /log/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.2.1
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
data/Capfile ADDED
@@ -0,0 +1,30 @@
1
+ # Load DSL and set up stages
2
+ require 'capistrano/setup'
3
+
4
+ # Include default deployment tasks
5
+ require 'capistrano/deploy'
6
+
7
+ # Include tasks from other gems included in your Gemfile
8
+ #
9
+ # For documentation on these, see for example:
10
+ #
11
+ # https://github.com/capistrano/rvm
12
+ # https://github.com/capistrano/rbenv
13
+ # https://github.com/capistrano/chruby
14
+ # https://github.com/capistrano/bundler
15
+ # https://github.com/capistrano/rails
16
+ # https://github.com/capistrano/passenger
17
+ #
18
+ # require 'capistrano/rvm'
19
+ # require 'capistrano/rbenv'
20
+ # require 'capistrano/chruby'
21
+ # require 'capistrano/bundler'
22
+ # require 'capistrano/rails/assets'
23
+ # require 'capistrano/rails/migrations'
24
+ # require 'capistrano/passenger'
25
+ # require 'capistrano/ssh_doctor'
26
+ require 'rvm1/capistrano3'
27
+ require 'capistrano/bundler'
28
+
29
+ # Load custom tasks from `lib/capistrano/tasks` if you have any defined
30
+ Dir.glob('lib/capistrano/tasks/*.rake').each { |r| import r }
data/Gemfile ADDED
@@ -0,0 +1,20 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in outrider_ruby.gemspec
4
+ gemspec
5
+
6
+ gem 'trollop', '~> 2.1.2'
7
+ gem 'nokogiri', '~> 1.6.6.2'
8
+ gem 'mechanize', '~> 2.7.3'
9
+ gem 'activesupport', '~> 4.2.1'
10
+ gem 'i18n', '~> 0.7.0'
11
+ gem 'activerecord', '~> 4.2.1'
12
+ gem 'mysql2', '~> 0.3.18'
13
+ gem 'rspec', '~> 3.2.0'
14
+ gem 'capistrano', '~> 3.4.0'
15
+ gem 'rvm1-capistrano3', '~> 1.3.2.2'
16
+ gem 'capistrano-bundler', '~> 1.1.4'
17
+ gem 'rack', '~> 1.6.0'
18
+ gem 'sinatra', '~> 1.4.6'
19
+ gem 'facets', '~> 3.0.0'
20
+
data/Gemfile.lock ADDED
@@ -0,0 +1,119 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ outrider (0.0.1)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ activemodel (4.2.1)
10
+ activesupport (= 4.2.1)
11
+ builder (~> 3.1)
12
+ activerecord (4.2.1)
13
+ activemodel (= 4.2.1)
14
+ activesupport (= 4.2.1)
15
+ arel (~> 6.0)
16
+ activesupport (4.2.1)
17
+ i18n (~> 0.7)
18
+ json (~> 1.7, >= 1.7.7)
19
+ minitest (~> 5.1)
20
+ thread_safe (~> 0.3, >= 0.3.4)
21
+ tzinfo (~> 1.1)
22
+ arel (6.0.0)
23
+ builder (3.2.2)
24
+ capistrano (3.4.0)
25
+ i18n
26
+ rake (>= 10.0.0)
27
+ sshkit (~> 1.3)
28
+ capistrano-bundler (1.1.4)
29
+ capistrano (~> 3.1)
30
+ sshkit (~> 1.2)
31
+ colorize (0.7.5)
32
+ diff-lcs (1.2.5)
33
+ domain_name (0.5.24)
34
+ unf (>= 0.0.5, < 1.0.0)
35
+ facets (3.0.0)
36
+ http-cookie (1.0.2)
37
+ domain_name (~> 0.5)
38
+ i18n (0.7.0)
39
+ json (1.8.2)
40
+ mechanize (2.7.3)
41
+ domain_name (~> 0.5, >= 0.5.1)
42
+ http-cookie (~> 1.0)
43
+ mime-types (~> 2.0)
44
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
45
+ net-http-persistent (~> 2.5, >= 2.5.2)
46
+ nokogiri (~> 1.4)
47
+ ntlm-http (~> 0.1, >= 0.1.1)
48
+ webrobots (>= 0.0.9, < 0.2)
49
+ mime-types (2.4.3)
50
+ mini_portile (0.6.2)
51
+ minitest (5.6.0)
52
+ mysql2 (0.3.18)
53
+ net-http-digest_auth (1.4)
54
+ net-http-persistent (2.9.4)
55
+ net-scp (1.2.1)
56
+ net-ssh (>= 2.6.5)
57
+ net-ssh (2.9.2)
58
+ nokogiri (1.6.6.2)
59
+ mini_portile (~> 0.6.0)
60
+ ntlm-http (0.1.1)
61
+ rack (1.6.0)
62
+ rack-protection (1.5.3)
63
+ rack
64
+ rake (10.4.2)
65
+ rspec (3.2.0)
66
+ rspec-core (~> 3.2.0)
67
+ rspec-expectations (~> 3.2.0)
68
+ rspec-mocks (~> 3.2.0)
69
+ rspec-core (3.2.2)
70
+ rspec-support (~> 3.2.0)
71
+ rspec-expectations (3.2.0)
72
+ diff-lcs (>= 1.2.0, < 2.0)
73
+ rspec-support (~> 3.2.0)
74
+ rspec-mocks (3.2.1)
75
+ diff-lcs (>= 1.2.0, < 2.0)
76
+ rspec-support (~> 3.2.0)
77
+ rspec-support (3.2.2)
78
+ rvm1-capistrano3 (1.3.2.2)
79
+ capistrano (~> 3.0)
80
+ sshkit (>= 1.2)
81
+ sinatra (1.4.6)
82
+ rack (~> 1.4)
83
+ rack-protection (~> 1.4)
84
+ tilt (>= 1.3, < 3)
85
+ sshkit (1.7.1)
86
+ colorize (>= 0.7.0)
87
+ net-scp (>= 1.1.2)
88
+ net-ssh (>= 2.8.0)
89
+ thread_safe (0.3.5)
90
+ tilt (2.0.1)
91
+ trollop (2.1.2)
92
+ tzinfo (1.2.2)
93
+ thread_safe (~> 0.1)
94
+ unf (0.1.4)
95
+ unf_ext
96
+ unf_ext (0.0.6)
97
+ webrobots (0.1.1)
98
+
99
+ PLATFORMS
100
+ ruby
101
+
102
+ DEPENDENCIES
103
+ activerecord (~> 4.2.1)
104
+ activesupport (~> 4.2.1)
105
+ bundler (~> 1.9)
106
+ capistrano (~> 3.4.0)
107
+ capistrano-bundler (~> 1.1.4)
108
+ facets (~> 3.0.0)
109
+ i18n (~> 0.7.0)
110
+ mechanize (~> 2.7.3)
111
+ mysql2 (~> 0.3.18)
112
+ nokogiri (~> 1.6.6.2)
113
+ outrider!
114
+ rack (~> 1.6.0)
115
+ rake (~> 10.0)
116
+ rspec (~> 3.2.0)
117
+ rvm1-capistrano3 (~> 1.3.2.2)
118
+ sinatra (~> 1.4.6)
119
+ trollop (~> 2.1.2)
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Jaap Badlands
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,261 @@
1
+ # OUTRIDER
2
+
3
+ ### Introduction
4
+
5
+ **Outrider Data Framework provides structure and tools for collecting, cleaning, storing and analysing data from around the web**
6
+
7
+ Built using Ruby and Python, Outrider's purpose is to provide an easy-to-use interface and set of tools to help create and run tasks that can programmatically visit, process, scrape, clean, store, analyse, access and display data from online sources.
8
+
9
+ Outrider projects are easily created using a rake command and your new project file automatically has access to the OutriderTools API and database. For example:
10
+
11
+ ```ruby
12
+ # creates a new project file
13
+ rake project:build['nz_herald','http://nzherald.com']
14
+
15
+ # /projects/nz_herald/auxiliary.rb
16
+ # Returns scraped data from each page to be stored in the database
17
+ # using the Outrider storage format.
18
+ def crawl options
19
+ OutriderTools::Crawl::site( @config, ->(page, uri){
20
+ unless( page.css('.articleTitle').text.strip.empty? )
21
+ clean_date = DateTime.strptime(page.css('.storyDate').text.strip, '%a %b %d %H:%M:%S %Z %Y').to_s #Tue Mar 03 08:27:23 UTC 2015
22
+ return {
23
+ :title_raw => page.css('.articleTitle').text.strip,
24
+ :author => page.css('.authorName a').text.strip,
25
+ :content_raw => page.css('#articleBody p').map{ |paragraph| paragraph.text.strip }.to_json,
26
+ :date_published_raw => page.css('.storyDate').text.strip,
27
+ :date_published_timestamp => clean_date,
28
+ :status => 'scraped'
29
+ }
30
+ else
31
+ return {
32
+ :status => 'rejected'
33
+ }
34
+ end
35
+ })
36
+ end
37
+ ```
38
+
39
+
40
+ ### Features
41
+
42
+ | Feature | Purpose |
43
+ | ------- | ------- |
44
+ Data Mining | Outrider provides tools for **collecting**, **cleaning** and **storing data** from the web.
45
+ Statistical Analysis | Outrider provides libraries for running **statistical algorithms** over datasets.
46
+
47
+
48
+
49
+ ## How it works
50
+ #### Command Line Interface
51
+ At its most basic level, Outrider provides a command line interface whose commands give us the ability to call and pass arguments to our API. The command line is used by running `./lib/ignite.rb`. When you call this file through your shell, you must pass it a command to run and any arguments to pass through to the command, such as:
52
+
53
+ ```shell
54
+ ./lib/ignite.rb crawl -p project_name
55
+ ```
56
+
57
+ #### API
58
+ The commands that can be recognised and run through the CLI form the *Outrider API*. The API sits behind and is accessed through the command line. Actions that are common to Outrider's main goals - such as crawling and scraping - are made available publicly by the API and can be called by passing the action as the first argument to `./lib/ignite.rb` when using the CLI. In the above example, the word *crawl* is a method of the Outrider API and `-p project_name` tells the API which project to look in for the *crawl* implementation.
59
+
60
+ ##### API Extension
61
+ The api can be extended by creating the functionality in your project's *auxiliary.rb* and modifying commandify.rb
62
+
63
+ For example:
64
+
65
+ ```ruby
66
+ # in ./lib/project/:project_name/auxiliary.rb
67
+ def my_own_command( options )
68
+ # Here you have access to the OutriderTools module
69
+ # Also, options is a hash of the commands passed in through CLI
70
+ end
71
+ ```
72
+
73
+ Commands are handled by a gem called Trollop
74
+
75
+ *IMPORTANT!*
76
+ * Do not modify the existing Trollop configuration
77
+ * Read the Trollop documentation at http://manageiq.github.io/trollop/
78
+ * Put your own configuration in the specified places - see below.
79
+ * Always run tests `rspec spec` after modifying this
80
+ * TODO - move this functionality into a setup where they don't have to touch commandify.rb
81
+
82
+ ```ruby
83
+ # in ./lib/outrider/commandify.rb
84
+ module Commandify
85
+ def self.process
86
+ # Place custom command options here. See instructions at http://manageiq.github.io/trollop/
87
+ sub_commands << %w()
88
+ # Set these to accept arguments through the command line and pass them to your auxiliary methods
89
+ command_opts = Trollop::options do
90
+ # REQUIRED. Do not mess with the default options.
91
+ # Do not duplicate arguments or their short form.
92
+ # Run tests after modifying
93
+ # opt :domain, "The domain", :short => "-d", :type => String, :default => ''
94
+
95
+
96
+ # CUSTOM. Place custom command options here
97
+
98
+
99
+ end
100
+ end
101
+ end
102
+ ```
103
+
104
+
105
+ ##### In your command line
106
+ ```shell
107
+ ./lib/ignite.rb my_own_command -p project_name -your_argument_key value
108
+ ```
109
+
110
+ ###### Options
111
+ Once set up in commandify.rb, calling your new method in auxiliary.rb will pass in a hash of the options specified in the command line call. This means your auxiliary methods need to accept the options hash.
112
+ ```ruby
113
+ def auxiliary_method( options )
114
+ # options contains a hash such as { :project => 'project_name' }
115
+ end
116
+ ```
117
+
118
+ #### Projects
119
+ When working with Outrider, you first create and then work within a *project*. You can create as many of these as you want. These let us create custom functionality to handle different jobs uniquely.
120
+
121
+
122
+ #### Creating Projects
123
+ ##### CLI
124
+ Projects can be created using the command line
125
+ ``` shell
126
+ # Creates ./lib/projects/:project_name/auxiliary.rb
127
+ #and adds a record to the projects table in the database,
128
+ # including a seed entry to the raw_data table
129
+ ./lib/ignite.rb create_project -p project_name
130
+
131
+ # Just adds a record to the projects table of the database
132
+ # and the seed entry in the raw_data table (doesn't create an auxiliary file)
133
+ ./lib/ignite.rb create_project_db_row -p project_name -d http://domain.com
134
+
135
+ # Deletes the project folder and row in the project table of the database
136
+ ./lib/ignite.rb delete_project -p project_name
137
+ ```
138
+
139
+ ##### RAKE
140
+ Outrider is assumed to run on two machines - a dev and a production server. When you create a new project you do so on the dev server and it gets added to git; however, in order to make a project runnable in production, there must also be corresponding entries in the production database. Outrider has a rake command that is run from the dev server which creates the files and db entry on the dev server and also creates the necessary database entries on the production server.
141
+
142
+ ```shell
143
+ rake project:build['project_name','project_domain']
144
+ ```
145
+
146
+ #### Customizing Projects
147
+ Once created, a project consists of a file `./lib/projects/:project_name/auxiliary.rb` which contains a class whose public methods correspond to the CLI commands.
148
+
149
+ ```ruby
150
+ # lib/projects/test_project/auxiliary.rb
151
+
152
+ class TestProject < Project
153
+
154
+ def initialize
155
+ project_name :test_project
156
+ end
157
+
158
+ def crawl
159
+ # See OutriderTools documentation http://github.com/deadlysyntax/outrider
160
+ # You have full access to the OutriderTools module
161
+ # You inherit all the methods and instance variables defined in the global Project class
162
+ # Which also gives you access to @config which is a hash containing :id, :title and :domain of the project
163
+
164
+ OutriderTools::Crawl::site( @config, ->(page, uri){
165
+ # Use the Nokogiri page object here to do what you want to each page
166
+ })
167
+ end
168
+ end
169
+ ```
170
+
171
+
172
+
173
+ ## The process
174
+ A call to `./ignite.rb crawl -p test_project` will
175
+ 1. Check the validity of the command against the API definition in **./lib/outrider/commandify.rb**,
176
+ 2. In this case **crawl** is a legitimate API method, and since that passes it will then
177
+ 3. Look in `./lib/project/test_project/auxiliary.rb` for a public method called **crawl**.
178
+ 4. If it doesn't find it there, it will look in the global `Project` object
179
+ 5. It will call **crawl** and pass in all the options specified in the command line (as long as they're set up in commandify)
180
+
181
+
182
+
183
+ # The OutriderTools module
184
+ Outrider Tools is a module that provides an API for the core functionality at the heart of the framework. It is loaded globally, so that you can call these functions from your projects' `auxiliary.rb` files.
185
+
186
+ ### OutriderTools API
187
+ #### Crawl
188
+ ##### site
189
+ ```ruby
190
+ OutriderTools::Crawl::site( project, each_page_callback )
191
+ ```
192
+ | Argument | Expected Value | Description |
193
+ | -------- | -------------- | ----------- |
194
+ **project** | { :id => 0, :title => '', :domain => '' } | A hash of project config values
195
+ **each_page_callback** | ->( page, uri ){} | A callback function to run, which gets passed the Nokogiri object and URI for each page
196
+
197
+ Recursively looks to the ProjectData in the database for the first `status: 'unscraped'` data record for the specified project. While at each page, it will run the callback and pass it the Nokogiri::HTML object and the current URI. It builds a list of sanitized urls and adds them as ProjectData rows in the database, thus recursively filtering through an entire domain and acting on each page.
198
+
199
+ http://www.rubydoc.info/github/sparklemotion/nokogiri
200
+
201
+ ________________________________
202
+
203
+ #### Scrape
204
+ ```ruby
205
+ OutriderTools::Scrape::page( url, operate )
206
+ ```
207
+ | Argument | Expected Value | Description |
208
+ | -------- | -------------- | ----------- |
209
+ **url** | "http://domain.com" | A url to scrape
210
+ **operate** | ->( page, uri ){} | A callback function to run, which gets passed the Nokogiri object and URI for each page
211
+
212
+ Will go to the URL and run the callback and pass it the Nokogiri::HTML object and the current URI.
213
+
214
+ http://www.rubydoc.info/github/sparklemotion/nokogiri
215
+
216
+ ________________________________
217
+
218
+ # Installation
219
+ #### Git clone
220
+
221
+ Move to the directory you'd like to put the Outrider app.
222
+
223
+ > git clone git@github.com:deadlysyntax/outrider.git
224
+
225
+
226
+ #### Configuration
227
+ The following configuration files must be created, assuming ./ is the app root:
228
+ ```ruby
229
+ # - config
230
+ # - - \ - database.yml
231
+ # - - - - hosts.yml
232
+ # - - \ - deploy.rb
233
+ # - - \ - deploy \
234
+ # - - \ - deploy \ - production.rb
235
+ ```
236
+
237
+
238
+ #### Database
239
+ *TODO add migrations*
240
+ The mysql-based schema for the Outrider database is found at https://github.com/deadlysyntax/outrider/blob/master/config/schema.sql This is only a guide, and will in future be handled by Active Record.
241
+
242
+ Set up a database, import the schema (if using mysql) and create the following file **./config/database.yml**. This file is expected by the system, which will not run unless these steps have been completed properly.
243
+
244
+ ```yaml
245
+ host: localhost
246
+ username: root
247
+ password: root
248
+ database: outrider
249
+ adapter: mysql2
250
+ ```
251
+
252
+
253
+ #### System Information
254
+ Requires Ruby 2.2.1. Outrider is run on two machines - the development machine and the remote server - and is deployed using Capistrano.
255
+
256
+ #### Tests
257
+ ```shell
258
+ # in project root ./
259
+ rspec spec
260
+ ```
261
+