trackit_scraper 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,22 +1,22 @@
1
- class LoginCommand
2
-
3
- def initialize(username, password, navigator)
4
- @username = username
5
- @password = password
6
- @navigator = navigator
7
- end
8
-
9
- def execute(b)
10
- @navigator.goto 'hd/index.ssp', b
11
- login b
12
- end
13
-
14
- private
15
-
16
- def login(b)
17
- b.text_field(name: 'user_id').set @username
18
- b.text_field(name: 'user_pwd').set @password
19
- b.button(value: 'Log on').click
20
- end
21
-
1
+ class LoginCommand
2
+
3
+ def initialize(username, password, navigator)
4
+ @username = username
5
+ @password = password
6
+ @navigator = navigator
7
+ end
8
+
9
+ def execute(b)
10
+ @navigator.goto 'hd/index.ssp', b
11
+ login b
12
+ end
13
+
14
+ private
15
+
16
+ def login(b)
17
+ b.text_field(name: 'user_id').set @username
18
+ b.text_field(name: 'user_pwd').set @password
19
+ b.button(value: 'Log on').click
20
+ end
21
+
22
22
  end
@@ -1,12 +1,12 @@
1
- class Navigator
2
-
3
- def initialize(base_url)
4
- @base_url = base_url
5
- end
6
-
7
- def goto(relative_url, b)
8
- url = File.join @base_url, relative_url
9
- b.goto url
10
- end
11
-
1
+ class Navigator
2
+
3
+ def initialize(base_url)
4
+ @base_url = base_url
5
+ end
6
+
7
+ def goto(relative_url, b)
8
+ url = File.join @base_url, relative_url
9
+ b.goto url
10
+ end
11
+
12
12
  end
@@ -1,45 +1,45 @@
1
- require 'thread'
2
- require 'watir-webdriver'
3
- require_relative 'request_page'
4
-
5
- class ParallelRequestScraper
6
-
7
- def initialize(navigator, login_command, threads)
8
- @navigator = navigator
9
- @login_command = login_command
10
- @threads = threads
11
- end
12
-
13
- def scrape(request_ids, &block)
14
- @request_ids = request_ids.clone
15
- @semaphore = Mutex.new
16
- @threads.times.map { Thread.new { scrape_requests &block } }.each { |t| t.join }
17
- end
18
-
19
- private
20
-
21
- def scrape_requests
22
- b = Watir::Browser.new
23
- @login_command.execute b
24
-
25
- until @request_ids.empty?
26
- request = scrape_next_request b
27
- request ? yield(request) : break
28
- end
29
-
30
- b.close
31
- end
32
-
33
- def scrape_next_request(b)
34
- id = next_request_id
35
- return unless id
36
- @navigator.goto "hd/ticket/euTicketView.ssp?ticket_id=#{id}&log=show", b
37
- request_page = RequestPage.new b
38
- request_page.get_request
39
- end
40
-
41
- def next_request_id
42
- @semaphore.synchronize { return @request_ids.pop }
43
- end
44
-
1
+ require 'thread'
2
+ require 'watir-webdriver'
3
+ require_relative 'request_page'
4
+
5
+ class ParallelRequestScraper
6
+
7
+ def initialize(navigator, login_command, threads)
8
+ @navigator = navigator
9
+ @login_command = login_command
10
+ @threads = threads
11
+ end
12
+
13
+ def scrape(request_ids, &block)
14
+ @request_ids = request_ids.clone
15
+ @semaphore = Mutex.new
16
+ @threads.times.map { Thread.new { scrape_requests &block } }.each { |t| t.join }
17
+ end
18
+
19
+ private
20
+
21
+ def scrape_requests
22
+ b = Watir::Browser.new
23
+ @login_command.execute b
24
+
25
+ until @request_ids.empty?
26
+ request = scrape_next_request b
27
+ request ? yield(request) : break
28
+ end
29
+
30
+ b.close
31
+ end
32
+
33
+ def scrape_next_request(b)
34
+ id = next_request_id
35
+ return unless id
36
+ @navigator.goto "hd/ticket/euTicketView.ssp?ticket_id=#{id}&log=show", b
37
+ request_page = RequestPage.new b
38
+ request_page.get_request
39
+ end
40
+
41
+ def next_request_id
42
+ @semaphore.synchronize { return @request_ids.pop }
43
+ end
44
+
45
45
  end
@@ -1,31 +1,46 @@
1
- require 'watir-webdriver'
2
-
3
- class RequestFinder
4
-
5
- def initialize(navigator)
6
- @navigator = navigator
7
- end
8
-
9
- def get_request_ids_for_users(users)
10
- b = Watir::Browser.new
11
- request_ids = users.map { |user| get_request_ids_for_user user, b }.flatten.sort
12
- b.close
13
- request_ids
14
- end
15
-
16
- private
17
-
18
- def get_request_ids_for_user(user, b)
19
- login_command = LoginCommand.new user[:username], user[:password], @navigator
20
- login_command.execute b
21
- @navigator.goto 'hd/ticket/euTicketFind.ssp', b
22
- get_all_request_ids b
23
- end
24
-
25
- def get_all_request_ids(b)
26
- b.button(value: 'Find').click
27
- b.links(href: /ticket_id=/).to_a.map { |link| link.href[/ticket_id=(\d+)/, 1].to_i }
28
- end
29
-
30
- end
31
-
1
+ require 'time'
2
+ require 'watir-webdriver'
3
+
4
+ class RequestFinder
5
+
6
+ def initialize(navigator)
7
+ @navigator = navigator
8
+ end
9
+
10
+ def get_request_ids_for_users(users, options={})
11
+ b = Watir::Browser.new
12
+ requests = users.map { |user| get_requests_for_user user, b }.flatten
13
+ b.close
14
+ start_date = options[:start_date]
15
+ requests.reject! { |r| start_date > r[:submitted_on] } if start_date
16
+ requests.map { |r| r[:id] }.sort
17
+ end
18
+
19
+ private
20
+
21
+ def get_requests_for_user(user, b)
22
+ login_command = LoginCommand.new user[:username], user[:password], @navigator
23
+ login_command.execute b
24
+ @navigator.goto 'hd/ticket/euTicketFind.ssp', b
25
+ get_all_requests b
26
+ end
27
+
28
+ def get_all_requests(b)
29
+ b.button(value: 'Find').click
30
+ rows = b.tables[1].rows.to_a
31
+ rows.shift 2
32
+
33
+ requests = rows.map do |row|
34
+ cells = row.cells.to_a
35
+ next if cells.empty?
36
+ {
37
+ id: cells[0].text.to_i,
38
+ submitted_on: Time.parse(cells[2].text)
39
+ }
40
+ end
41
+
42
+ requests.compact
43
+ end
44
+
45
+ end
46
+
@@ -1,25 +1,25 @@
1
- require 'time'
2
-
3
- class RequestHistoryTable
4
-
5
- def initialize(table)
6
- @table = table
7
- end
8
-
9
- def get_resolution_info
10
- r = {}
11
- rows = @table.trs.to_a
12
- resolved_row_index = rows.index { |r| r.text =~ /Resolved by/ }
13
-
14
- if resolved_row_index
15
- row_above = rows[resolved_row_index-1]
16
- captures = row_above.text.scan(/(.+) by (.+)/)[0]
17
- r[:resolved_on] = Time.parse captures[0]
18
- r[:resolved_by] = captures[1]
19
- end
20
-
21
- r[:history] = @table.text
22
- r
23
- end
24
-
1
+ require 'time'
2
+
3
+ class RequestHistoryTable
4
+
5
+ def initialize(table)
6
+ @table = table
7
+ end
8
+
9
+ def get_resolution_info
10
+ r = {}
11
+ rows = @table.trs.to_a
12
+ resolved_row_index = rows.index { |r| r.text =~ /Resolved by/ }
13
+
14
+ if resolved_row_index
15
+ row_above = rows[resolved_row_index-1]
16
+ captures = row_above.text.scan(/(.+) by (.+)/)[0]
17
+ r[:resolved_on] = Time.parse captures[0]
18
+ r[:resolved_by] = captures[1]
19
+ end
20
+
21
+ r[:history] = @table.text
22
+ r
23
+ end
24
+
25
25
  end
@@ -1,43 +1,43 @@
1
- class RequestInfoTable
2
-
3
- def initialize(table)
4
- @cells = table.tds.to_a
5
- end
6
-
7
- def get_request_info
8
- {
9
- title: cell(0),
10
- status: cell(2),
11
- service: cell(3),
12
- request_type: cell(4),
13
- time_spent: cell(5),
14
- priority: cell(7),
15
- deadline: time_cell(8),
16
- submitted_to: cell(10),
17
- submitted_by: cell(11),
18
- submitted_on: time_cell(12),
19
- assigned_to: cell(15),
20
- assigned_by: cell(16),
21
- assigned_on: time_cell(17),
22
- department_id: cell(19),
23
- closed_by: cell(20),
24
- closed_on: maybe_time_cell(21, 'None')
25
- }
26
- end
27
-
28
- private
29
-
30
- def maybe_time_cell(index, non_time_value)
31
- cell(index) { |s| s == non_time_value ? cell(index) : time_cell(index) }
32
- end
33
-
34
- def time_cell(index)
35
- cell(index) { |s| Time.parse s }
36
- end
37
-
38
- def cell(index)
39
- text = @cells[index].text
40
- block_given? ? yield(text) : text
41
- end
42
-
1
+ class RequestInfoTable
2
+
3
+ def initialize(table)
4
+ @cells = table.tds.to_a
5
+ end
6
+
7
+ def get_request_info
8
+ {
9
+ title: cell(0),
10
+ status: cell(2),
11
+ service: cell(3),
12
+ request_type: cell(4),
13
+ time_spent: cell(5),
14
+ priority: cell(7),
15
+ deadline: time_cell(8),
16
+ submitted_to: cell(10),
17
+ submitted_by: cell(11),
18
+ submitted_on: time_cell(12),
19
+ assigned_to: cell(15),
20
+ assigned_by: cell(16),
21
+ assigned_on: time_cell(17),
22
+ department_id: cell(19),
23
+ closed_by: cell(20),
24
+ closed_on: maybe_time_cell(21, 'None')
25
+ }
26
+ end
27
+
28
+ private
29
+
30
+ def maybe_time_cell(index, non_time_value)
31
+ cell(index) { |s| s == non_time_value ? cell(index) : time_cell(index) }
32
+ end
33
+
34
+ def time_cell(index)
35
+ cell(index) { |s| Time.parse s }
36
+ end
37
+
38
+ def cell(index)
39
+ text = @cells[index].text
40
+ block_given? ? yield(text) : text
41
+ end
42
+
43
43
  end
@@ -1,25 +1,25 @@
1
- require_relative 'request_info_table'
2
- require_relative 'request_history_table'
3
-
4
- class RequestPage
5
-
6
- def initialize(b)
7
- @b = b
8
- end
9
-
10
- def get_request
11
- request_info_table = RequestInfoTable.new @b.tables[1]
12
- request_history_table = RequestHistoryTable.new @b.tables[3]
13
- r = { id: request_id }
14
- r.merge! request_info_table.get_request_info
15
- r.merge! request_history_table.get_resolution_info
16
- r
17
- end
18
-
19
- private
20
-
21
- def request_id
22
- @b.text[/Request #(\d+)/, 1]
23
- end
24
-
1
+ require_relative 'request_info_table'
2
+ require_relative 'request_history_table'
3
+
4
+ class RequestPage
5
+
6
+ def initialize(b)
7
+ @b = b
8
+ end
9
+
10
+ def get_request
11
+ request_info_table = RequestInfoTable.new @b.tables[1]
12
+ request_history_table = RequestHistoryTable.new @b.tables[3]
13
+ r = { id: request_id }
14
+ r.merge! request_info_table.get_request_info
15
+ r.merge! request_history_table.get_resolution_info
16
+ r
17
+ end
18
+
19
+ private
20
+
21
+ def request_id
22
+ @b.text[/Request #(\d+)/, 1]
23
+ end
24
+
25
25
  end
@@ -1,51 +1,51 @@
1
- $stdout.sync = true
2
-
3
- require 'fileutils'
4
- require 'json'
5
- require_relative 'lib/login_command'
6
- require_relative 'lib/navigator'
7
- require_relative 'lib/parallel_request_scraper'
8
- require_relative 'lib/request_finder'
9
-
10
- class TrackIt
11
-
12
- def initialize(base_url, username, password, options={})
13
- navigator = Navigator.new base_url
14
- login_command = LoginCommand.new username, password, navigator
15
- threads = options[:threads] || 1
16
- @parallel_request_scraper = ParallelRequestScraper.new navigator, login_command, threads
17
- @request_finder = RequestFinder.new navigator
18
- @output_dir = options[:output_dir] || default_output_dir
19
- end
20
-
21
- def scrape_requests_for_users(users)
22
- request_ids = @request_finder.get_request_ids_for_users users
23
- scrape_requests request_ids
24
- end
25
-
26
- def scrape_requests_in_range(from_id, to_id)
27
- request_ids = (from_id..to_id).to_a
28
- scrape_requests request_ids
29
- end
30
-
31
- private
32
-
33
- def scrape_requests(request_ids)
34
- FileUtils.mkdir_p @output_dir
35
- @parallel_request_scraper.scrape(request_ids) { |request| write_request_file request }
36
- end
37
-
38
- def write_request_file(request)
39
- File.write request_file_path(request), request.to_json
40
- print '.'
41
- end
42
-
43
- def request_file_path(request)
44
- File.join @output_dir, "#{request[:id]}.json"
45
- end
46
-
47
- def default_output_dir
48
- File.join 'output', Time.now.strftime('%Y-%m-%d-%H-%M')
49
- end
50
-
1
+ $stdout.sync = true
2
+
3
+ require 'fileutils'
4
+ require 'json'
5
+ require_relative 'lib/login_command'
6
+ require_relative 'lib/navigator'
7
+ require_relative 'lib/parallel_request_scraper'
8
+ require_relative 'lib/request_finder'
9
+
10
+ class TrackIt
11
+
12
+ def initialize(base_url, username, password, options={})
13
+ navigator = Navigator.new base_url
14
+ login_command = LoginCommand.new username, password, navigator
15
+ threads = options[:threads] || 1
16
+ @parallel_request_scraper = ParallelRequestScraper.new navigator, login_command, threads
17
+ @request_finder = RequestFinder.new navigator
18
+ @output_dir = options[:output_dir] || default_output_dir
19
+ end
20
+
21
+ def scrape_requests_for_users(users, options={})
22
+ request_ids = @request_finder.get_request_ids_for_users users, options
23
+ scrape_requests request_ids
24
+ end
25
+
26
+ def scrape_requests_in_range(from_id, to_id)
27
+ request_ids = (from_id..to_id).to_a
28
+ scrape_requests request_ids
29
+ end
30
+
31
+ private
32
+
33
+ def scrape_requests(request_ids)
34
+ FileUtils.mkdir_p @output_dir
35
+ @parallel_request_scraper.scrape(request_ids) { |request| write_request_file request }
36
+ end
37
+
38
+ def write_request_file(request)
39
+ File.write request_file_path(request), request.to_json
40
+ print '.'
41
+ end
42
+
43
+ def request_file_path(request)
44
+ File.join @output_dir, "#{request[:id]}.json"
45
+ end
46
+
47
+ def default_output_dir
48
+ File.join 'output', Time.now.strftime('%Y-%m-%d-%H-%M')
49
+ end
50
+
51
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: trackit_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-06 00:00:00.000000000 Z
12
+ date: 2013-07-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
16
- requirement: &70292605876980 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,15 @@ dependencies:
21
21
  version: 1.7.7
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70292605876980
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 1.7.7
25
30
  - !ruby/object:Gem::Dependency
26
31
  name: watir-webdriver
27
- requirement: &70292605876500 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
28
33
  none: false
29
34
  requirements:
30
35
  - - ~>
@@ -32,7 +37,12 @@ dependencies:
32
37
  version: 0.6.4
33
38
  type: :runtime
34
39
  prerelease: false
35
- version_requirements: *70292605876500
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 0.6.4
36
46
  description:
37
47
  email: matthew-github@matthewriley.name
38
48
  executables: []
@@ -68,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
68
78
  version: '0'
69
79
  requirements: []
70
80
  rubyforge_project:
71
- rubygems_version: 1.8.15
81
+ rubygems_version: 1.8.24
72
82
  signing_key:
73
83
  specification_version: 3
74
84
  summary: Screen scrapes data from the Track-It help desk web application.