mine_shaft 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +3 -0
- data/LICENSE +20 -0
- data/README.textile +93 -0
- data/Rakefile +9 -0
- data/lib/mine_shaft.rb +13 -0
- data/lib/mine_shaft/errors.rb +5 -0
- data/lib/mine_shaft/html_table.rb +101 -0
- data/lib/mine_shaft/login_page.rb +51 -0
- data/lib/mine_shaft/shaft.rb +49 -0
- data/lib/mine_shaft/user_agent.rb +109 -0
- data/lib/mine_shaft/version.rb +3 -0
- data/lib/mine_shaft/web_page.rb +31 -0
- data/spec/fixtures/failed_login.html +118 -0
- data/spec/fixtures/home_page.html +3 -0
- data/spec/fixtures/login.html +117 -0
- data/spec/fixtures/multiple_tables.html +212 -0
- data/spec/fixtures/projects_table.html +16 -0
- data/spec/fixtures/wiki_page_with_multiple_tables.html +193 -0
- data/spec/fixtures/wiki_page_with_no_table_id.html +176 -0
- data/spec/fixtures/wiki_page_with_projects_table.html +176 -0
- data/spec/mine_shaft/html_table_spec.rb +60 -0
- data/spec/mine_shaft/shaft_spec.rb +56 -0
- data/spec/mine_shaft/user_agent_spec.rb +62 -0
- data/spec/mine_shaft/web_page_spec.rb +25 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/support/mine_shaft_helpers.rb +9 -0
- metadata +169 -0
data/CHANGELOG
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 Tom Kersten, http://tomkersten.com/
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
h1. MineShaft: Seed data from your stakeholders
|
2
|
+
|
3
|
+
h2. Overview
|
4
|
+
|
5
|
+
MineShaft uses @mechanize@ to walk through a Redmine project site and parse content
|
6
|
+
which can't be easily accessed via the REST API (yet). The gem also provides a
|
7
|
+
simple way to deserialize an HTML table into an Array of Hash objects which are
|
8
|
+
key-value pairs of the table heading(s) and the corresponding value for each row
|
9
|
+
in the table.
|
10
|
+
|
11
|
+
h2. Motivation
|
12
|
+
|
13
|
+
We were looking for some domain-specific information to work with in an
|
14
|
+
application and wanted to feed off of the knowledge of some of our stakeholders.
|
15
|
+
We didn't want to build out an admin interface which would be replaced with an
|
16
|
+
automated system down the road and having our users edit text files and send
|
17
|
+
them to us was not going to work. We created a @Seed_Data@ wiki page and threw
|
18
|
+
in a few tables with IDs and wrote MineShaft to parse those tables and convert
|
19
|
+
them into Hashes we could use to add/update content in the application database.
|
20
|
+
Using this method allows us to spread data entry & review requests out to a
|
21
|
+
wide variety of people in an efficient manner.
|
22
|
+
|
23
|
+
h2. Usage
|
24
|
+
|
25
|
+
Assuming you have a Redmine installation set up at @http://your.rm-install.com@
|
26
|
+
and a project named @twitter4biz@ set up.
|
27
|
+
|
28
|
+
Given a wiki page titled "Seed_Data" with the following table definition
|
29
|
+
_somewhere_ in that page (note: the table ID must be present):
|
30
|
+
|
31
|
+
bc. table(#companies).
|
32
|
+
|Ticker|Name |
|
33
|
+
|AAPL |Apple |
|
34
|
+
|MSFT |Microsoft|
|
35
|
+
|GOOG |Google |
|
36
|
+
|YHOO |Yahoo |
|
37
|
+
|
38
|
+
Then in IRB (or whatever):
|
39
|
+
|
40
|
+
bc. require 'mine_shaft'
|
41
|
+
include MineShaft
|
42
|
+
shaft = Shaft.new('rm-username', 'rm-password', 'http://your.rm-install.com')
|
43
|
+
companies = shaft.grab("companies", '/projects/twitter4biz/wiki/Seed_Data')
|
44
|
+
=> [{:name => 'Apple', :ticker => 'AAPL'}, {:name => 'Microsoft', :ticker => 'MSFT'},...]
|
45
|
+
|
46
|
+
So, in the db/seeds.rb file of your Rails app, you could put something like the
|
47
|
+
following (assuming you have also included the previous code example):
|
48
|
+
|
49
|
+
bc. companies.each do |attributes|
|
50
|
+
company = Company.find_by_ticker(attributes[:ticker])
|
51
|
+
if company.nil?
|
52
|
+
Company.create!(attributes)
|
53
|
+
puts "Added '#{attributes[:ticker]}'"
|
54
|
+
else
|
55
|
+
ticker = attributes.delete(:ticker)
|
56
|
+
company.update(attributes)
|
57
|
+
puts "Updated '#{ticker}'"
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
...and then run:
|
62
|
+
|
63
|
+
bc. rake db:seed
|
64
|
+
|
65
|
+
h2. Installation
|
66
|
+
|
67
|
+
bc. gem install mine_shaft
|
68
|
+
|
69
|
+
h2. Contributing
|
70
|
+
|
71
|
+
# Fork it...
|
72
|
+
# bundle install
|
73
|
+
# ...make awesomeness
|
74
|
+
# Commit (ideally) to a feature-branch
|
75
|
+
# Send a pull request
|
76
|
+
|
77
|
+
h2. Found a bug?
|
78
|
+
|
79
|
+
File an issue on the project's "issues page":https://github.com/gn-research/mine_shaft/issues
|
80
|
+
|
81
|
+
h2. Dependencies
|
82
|
+
|
83
|
+
* mechanize
|
84
|
+
|
85
|
+
h2. License
|
86
|
+
|
87
|
+
Refer to LICENSE file (hint: MIT)
|
88
|
+
|
89
|
+
h2. Future Plans
|
90
|
+
|
91
|
+
The gem is meeting our needs at the moment, so we don't have any plans to add
|
92
|
+
significant functionality at the moment. However, it has come in quite handy
|
93
|
+
so far, so we may end up expanding it further if a new need arises.
|
data/Rakefile
ADDED
data/lib/mine_shaft.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Redmine doesn't support XML auth to wiki pages...we need to get dirty w/ it
|
2
|
+
|
3
|
+
require 'mechanize'
|
4
|
+
require 'mine_shaft/errors'
|
5
|
+
require 'mine_shaft/html_table'
|
6
|
+
require 'mine_shaft/login_page'
|
7
|
+
require 'mine_shaft/shaft'
|
8
|
+
require 'mine_shaft/user_agent'
|
9
|
+
require 'mine_shaft/web_page'
|
10
|
+
|
11
|
+
# Root namespace of the gem. The Errors module (loaded via
# 'mine_shaft/errors' above) is mixed into the namespace itself.
# NOTE(review): Errors presumably holds FailedLogin / InvalidPage -- confirm
# in lib/mine_shaft/errors.rb.
module MineShaft
  include Errors
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
module MineShaft
  # Provides several convenience methods for translating a (mechanize-) parsed
  # HTML table into standard Ruby data structures. All tables are assumed to
  # have a "heading" row as the first row, and that header uses <td> elements
  # (instead of <th>).
  class HTMLTable
    # Public: Initialize a new HTMLTable with the specified table-data as
    # parsed by mechanize (or Nokogiri).
    #
    # parsed_table - A Nokogiri::HTML::Document or Nokogiri::XML::Element scoped
    #                to only the HTML table you are interested in. Technically
    #                speaking, you could pass in more content than just the
    #                <table> element and it would likely work fine, but that is
    #                the anticipated content structure. The only requirement
    #                this class places on it is a #search(selector) method
    #                whose results respond to #content.
    #
    # Returns an instance of HTMLTable
    def initialize(parsed_table)
      @table = parsed_table
    end

    # Public: Retrieve the content of all the <td> elements from the table,
    # except for the first row.
    #
    # Returns an Array of Array elements, each one being the content from one
    #   row of the table. The returned content does NOT include the first row,
    #   as it is assumed to be the heading of the table.
    # Returns an empty Array if the table has no rows or no cells.
    def content_rows
      columns = column_count
      # each_slice(0) raises ArgumentError, so bail out on an empty table.
      return [] if columns.zero?

      cells = td_elements
      body_cells = cells[columns, cells.size]
      # each_slice replaces enum_slice, which no longer exists in Ruby 1.9+.
      body_cells.each_slice(columns).to_a
    end

    # Public: Converts HTML table to an Array of Hash objects, using the column
    # headings as keys for each Hash element.
    #
    # Examples
    #
    #   Given 'names' was initialized with the following table:
    #
    #     ---------------------
    #     |Name   |Number     |
    #     ---------------------
    #     |John   |123-456-7890|
    #     ---------------------
    #
    #   names.deserialize
    #   # => [{:name => "John", :number => "123-456-7890"}]
    #
    # Returns an Array of Hash objects. Each Hash element is a
    #   key-value mapping of "table header"-"row content". (Note that
    #   the key is a downcased-symbol of the heading value). When two columns
    #   share a heading, the value of the first such column is kept.
    def deserialize
      # Compute the keys once; every call to symbolized_headings re-scans the
      # whole table.
      keys = symbolized_headings

      content_rows.map do |row_cells|
        attributes = {}
        keys.each_with_index do |key, position|
          attributes[key] = row_cells[position] unless attributes.key?(key)
        end
        attributes
      end
    end

    # Public: Retrieves the content from all <td> elements in the table.
    #
    # Returns an Array of the content contained in each <td> element.
    def td_elements
      @table.search("td").map(&:content)
    end

    # Public: Retrieves the content from the <td> elements of the first row of
    # the table.
    #
    # Returns an Array of the content contained in each <td> element of the
    #   first row (empty when the table has no rows).
    def headings
      td_elements.slice(0, column_count)
    end
    alias :headers :headings

    # Public: Converts the return value of #headings to an Array of
    # lower-cased Symbol elements.
    #
    # Returns an Array of Symbol elements.
    def symbolized_headings
      headings.map { |header| header.downcase.to_sym }
    end

    private

    # Counts the number of columns in the table by dividing the total number
    # of <td> cells by the number of <tr> rows (integer division).
    #
    # Returns the number of columns, or 0 when the table has no rows.
    def column_count
      rows = row_count
      return 0 if rows.zero?

      td_elements.count / rows
    end

    # Counts the number of rows in the table.
    #
    # Returns the number of rows.
    def row_count
      @table.search("tr").count
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module MineShaft
  # Wraps a fetched page and exposes the Redmine login form found on it, so
  # callers can fill the form in and submit it without searching the page
  # themselves.
  class LoginPage
    # The relative URL for the login page on a Redmine site
    LOGIN_FORM_ACTION = '/login'

    # Public: Instantiates a new LoginPage object.
    #
    # page - The fetched page. It must respond to #forms, and each form to
    #        #action (UserAgent passes in what Mechanize's #get / #submit
    #        return).
    #
    # Returns a new instance of a LoginPage.
    # Raises InvalidPage if the specified page does not contain the login form.
    def initialize(page)
      @page = page
      return if login_page?

      raise InvalidPage, "Page specified does not appear to be the login page"
    end

    # Public: Confirms whether the specified page is the Redmine login page or
    # not, by attempting to build a LoginPage from it.
    #
    # page - The fetched page (same requirements as for .new).
    #
    # Returns true if the specified page is the login page.
    # Returns false if the specified page is not the login page.
    def self.valid?(page)
      new(page)
      true
    rescue InvalidPage
      false
    end

    # Public: Retrieves the login form from the page, i.e. the first form
    # whose action equals LOGIN_FORM_ACTION. A found form is cached.
    #
    # Returns the matching form object, or nil when the page has no such form.
    def login_form
      @login_form ||= @page.forms.find { |form| form.action == LOGIN_FORM_ACTION }
    end

    private

    # Confirms whether the page is the login page or not
    #
    # Returns true if the login form was found
    # Returns false if the login form was not found
    def login_page?
      login_form ? true : false
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module MineShaft
  # Small facade for pulling an id'd HTML table out of a specific page of a
  # Redmine project site and handing it back as plain Ruby data.
  class Shaft
    # The relative URL for the login page on a Redmine site
    LOGIN_PAGE_URL = "/login"

    # Public: Initializes new instance of Shaft class.
    #
    # username - The username to log in with on the specified Redmine site.
    # password - The password to log in with on the specified Redmine site.
    # base_uri - The URL of the Redmine installation.
    #
    # Examples
    #
    #   shaft = Shaft.new('uname', 'password', 'http://myredmineinstall.com')
    #
    # Returns a new instance of the Shaft class.
    def initialize(username, password, base_uri)
      @agent = UserAgent.new(username, password, base_uri)
      # Neither of the two values below is read anywhere else in this class.
      @login_page = LOGIN_PAGE_URL
      @login_action = @login_page
    end

    # Public: Logs in, fetches the specified page and deserializes the <table>
    # carrying the specified ID.
    #
    # table_id               - The HTML id of the desired table as a String.
    # relative_wiki_page_url - The relative URL of the page within the Redmine
    #                          site.
    #
    # Examples
    #
    #   shaft.grab('names', '/projects/name-parser/Wiki/Name_Data')
    #
    # Returns an Array of Hash objects. Each Hash element is a
    #   key-value mapping of "table header"-"row content". (Note that
    #   the key is a downcased-symbol of the heading value).
    # Raises FailedLogin if the login failed
    # Raises InvalidPage if the login page of the site renders a 404
    #                    OR if a table with the supplied ID is not found on the
    #                    specified page.
    def grab(table_id, relative_wiki_page_url)
      @agent.log_in
      fetched_page = @agent.get(relative_wiki_page_url)
      WebPage.new(fetched_page).find_table(table_id).deserialize
    end
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module MineShaft
  # Acts as a headless browser to log in and interact with a Redmine site.
  class UserAgent
    # The relative URL to hit for the login page on a Redmine site.
    LOGIN_ACTION = '/login'

    # Public: Creates a new instance of the UserAgent class.
    #
    # username - The username to use for logging into the Redmine site.
    # password - The password to use for logging into the Redmine site.
    # base_uri - The URL of the Redmine site.
    #
    # Returns a new instance of the UserAgent class.
    def initialize(username, password, base_uri)
      @username = username
      @password = password
      @base_uri = base_uri
      @agent = Mechanize.new
      @logged_in = false
    end

    # Public: Retrieves the specified page from the Redmine site and remembers
    # it as the current page.
    #
    # page - The relative URL of the page to retrieve as a String. It is
    #        appended verbatim to the base URI.
    #
    # Returns the page parsed into a Mechanize::Page object.
    def get(page)
      @current_page = @agent.get("#{@base_uri}#{page}")
    end

    # Public: Logs into the Redmine site using credentials specified on object
    # instantiation. Does nothing if a previous login already succeeded.
    #
    # Returns true if login process was successful.
    # Raises FailedLogin if the login was not successful.
    # Raises InvalidPage if the site answers with an HTTP error status (for
    #   example a 404); the message names the status that was received.
    def log_in
      return true if logged_in?
      fill_out_login_form
      submit(login_form, login_form.buttons.first)

      # Redmine re-renders the login form when the credentials are rejected.
      if back_on_login_page?
        raise FailedLogin, "Login failed. Please verify username & password"
      end
      @logged_in = true
    rescue Mechanize::ResponseCodeError => error
      # Report the status actually received rather than always claiming 404.
      raise InvalidPage, "'#{@base_uri}' is returning a #{error.response_code}. Please verify the URL is a functioning Redmine installation"
    end

    # Public: Submits the specified form by clicking the specified button on
    # said form, and remembers the result as the current page.
    #
    # form   - A Mechanize::Form object.
    # button - A Mechanize::Form::Submit object.
    #
    # Returns the page resulting from the submission process as a
    #   Mechanize::Page object.
    def submit(form, button)
      @current_page = @agent.submit(form, button)
    end

    # Public: Confirms whether or not the UserAgent instance is logged in.
    #
    # Returns true if the UserAgent is currently logged in.
    # Returns false if the UserAgent is not currently logged in.
    def logged_in?
      @logged_in
    end

    private

    # Enters the username & password specified during instantiation into the
    # username & password text fields of the login form.
    #
    # Returns nothing.
    def fill_out_login_form
      login_form.username = @username
      login_form.password = @password
      nil
    end

    # Convenience method to retrieve the login form from the Redmine
    # installation's login page.
    #
    # Returns the login form (caches form object, so you can interact with the
    #   return value directly on the method).
    def login_form
      @login_form ||= load_login_page && @login_page.login_form
    end

    # Retrieves and caches the login page of the Redmine installation.
    #
    # Returns a LoginPage object (object is cached, so you can interact with
    #   the return value directly on the method).
    def load_login_page
      @login_page ||= LoginPage.new(get(LOGIN_ACTION))
    end

    # Confirms whether the current page is the login page...used to detect if
    # a login failed.
    #
    # Returns true if the current page is the Redmine installation's login
    #   page
    # Returns false if the current page is not the Redmine installation's login
    #   page
    def back_on_login_page?
      return LoginPage.valid?(@current_page)
    end
  end
end
|