rubyscraper 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6f4cfba8b1442632b6c54f30886d254ff25fbbd3
4
- data.tar.gz: 4bc8431e8294d5900819d29735f69650e93448da
3
+ metadata.gz: e1b7f2d272df18fbf97f0da1c6d655077e86bf06
4
+ data.tar.gz: d7d09ff13581907abab68aafce6e15b0843bea58
5
5
  SHA512:
6
- metadata.gz: 73ff93065f3079602dfcc58e35e08686ce4fff2d693b7f80c63e8cd37f011826156731b7d8a42a84472232c4a0c5b18dcb8efde1e7a7ede2ded37c431d22f95f
7
- data.tar.gz: 04e30fe957b8d95a25cb0620539eca3b8956eb1c5638c8bd038aa6b461f4868555889b033201b350dc651f5faff7e2e38c2fac86ea98ce4bf13ac76de21834c5
6
+ metadata.gz: 1f06d2c6e6e91658a90d44fd3bc4ff55b8a2bf99116d016bf4d7afbe2cefc9b6f911f2b7a0bcfc31665aba3d4d5d962eb66e36be264eaf5a18434e39a461ea89
7
+ data.tar.gz: ad976858fee74b80497619622d76349a34ad586ace4aaa4657134d1fe7f38fabe82ec46d514693ebc7c7b75b3c704a58cb296deca4b3f516b6ab6a462c7e358d
data/Gemfile.lock CHANGED
@@ -16,10 +16,13 @@ GEM
16
16
  rack-test (>= 0.5.4)
17
17
  xpath (~> 2.0)
18
18
  cliver (0.3.2)
19
+ coderay (1.1.0)
20
+ diff-lcs (1.2.5)
19
21
  domain_name (0.5.24)
20
22
  unf (>= 0.0.5, < 1.0.0)
21
23
  http-cookie (1.0.2)
22
24
  domain_name (~> 0.5)
25
+ method_source (0.8.2)
23
26
  mime-types (2.4.3)
24
27
  mini_portile (0.6.2)
25
28
  multi_json (1.11.0)
@@ -31,6 +34,10 @@ GEM
31
34
  cliver (~> 0.3.1)
32
35
  multi_json (~> 1.0)
33
36
  websocket-driver (>= 0.2.0)
37
+ pry (0.10.1)
38
+ coderay (~> 1.1.0)
39
+ method_source (~> 0.8.1)
40
+ slop (~> 3.4)
34
41
  rack (1.6.0)
35
42
  rack-test (0.6.3)
36
43
  rack (>= 1.0)
@@ -39,6 +46,20 @@ GEM
39
46
  http-cookie (>= 1.0.2, < 2.0)
40
47
  mime-types (>= 1.16, < 3.0)
41
48
  netrc (~> 0.7)
49
+ rspec (3.2.0)
50
+ rspec-core (~> 3.2.0)
51
+ rspec-expectations (~> 3.2.0)
52
+ rspec-mocks (~> 3.2.0)
53
+ rspec-core (3.2.3)
54
+ rspec-support (~> 3.2.0)
55
+ rspec-expectations (3.2.1)
56
+ diff-lcs (>= 1.2.0, < 2.0)
57
+ rspec-support (~> 3.2.0)
58
+ rspec-mocks (3.2.1)
59
+ diff-lcs (>= 1.2.0, < 2.0)
60
+ rspec-support (~> 3.2.0)
61
+ rspec-support (3.2.2)
62
+ slop (3.6.0)
42
63
  unf (0.1.4)
43
64
  unf_ext
44
65
  unf_ext (0.0.7.1)
@@ -53,5 +74,7 @@ PLATFORMS
53
74
 
54
75
  DEPENDENCIES
55
76
  bundler (~> 1.9)
77
+ pry
56
78
  rake (~> 10.0)
79
+ rspec (~> 3.0)
57
80
  rubyscraper!
data/lib/assets/scrapes.json ADDED
@@ -0,0 +1,287 @@
1
+ [
2
+ {
3
+ "name":"stackoverflow",
4
+ "base_url":"http://www.careers.stackoverflow.com",
5
+ "summary":{
6
+ "url":"/jobs?searchTerm=SEARCHTERM&sort=p",
7
+ "pagination_fmt":"&pg=",
8
+ "pagination_start":"1",
9
+ "pagination_scale":"1",
10
+ "params":[
11
+ {
12
+ "SEARCHTERM":[
13
+ "ruby",
14
+ "ruby+on+rails",
15
+ "javascript"
16
+ ]
17
+ }
18
+ ],
19
+ "loop":".listResults .-item",
20
+ "fields":[
21
+ {
22
+ "field":"position",
23
+ "method":"find",
24
+ "path":"h3.-title a"
25
+ },
26
+ {
27
+ "field":"url",
28
+ "method":"find",
29
+ "path":"h3.-title a",
30
+ "attr":"href"
31
+ },
32
+ {
33
+ "field":"posting_date",
34
+ "method":"first",
35
+ "path":"p._muted"
36
+ }
37
+ ]
38
+ },
39
+ "sub_page":{
40
+ "fields":[
41
+ {
42
+ "field":"company",
43
+ "method":"find",
44
+ "path":"a.employer"
45
+ },
46
+ {
47
+ "field":"location",
48
+ "method":"find",
49
+ "path":"span.location"
50
+ },
51
+ {
52
+ "field":"description",
53
+ "method":"all",
54
+ "path":"div.description p",
55
+ "loop_collect":"text",
56
+ "join":"\n"
57
+ },
58
+ {
59
+ "field":"tags",
60
+ "method":"all",
61
+ "path":"div.tags a.post-tag",
62
+ "loop_collect":"text",
63
+ "join":", "
64
+ }
65
+ ]
66
+ }
67
+ },
68
+ {
69
+ "name":"rubynow",
70
+ "base_url":"http://jobs.rubynow.com/",
71
+ "summary":{
72
+ "url":"",
73
+ "no_pagination?":"true",
74
+ "pagination_fmt":"",
75
+ "pagination_start":"",
76
+ "pagination_scale":"",
77
+ "params":[
78
+ ],
79
+ "loop":"ul.jobs li",
80
+ "fields":[
81
+ {
82
+ "field":"position",
83
+ "method":"find",
84
+ "path":"h3 a"
85
+ },
86
+ {
87
+ "field":"url",
88
+ "method":"find",
89
+ "path":"h3 a",
90
+ "attr":"href"
91
+ },
92
+ {
93
+ "field":"posting_date",
94
+ "method":"find",
95
+ "path":"span.date"
96
+ }
97
+ ]
98
+ },
99
+ "sub_page":{
100
+ "fields":[
101
+ {
102
+ "field":"description",
103
+ "method":"all",
104
+ "path":"div#info p",
105
+ "loop_collect":"text",
106
+ "join":"\n"
107
+ },
108
+ {
109
+ "field":"company",
110
+ "method":"find",
111
+ "path":"h2#headline a"
112
+ },
113
+ {
114
+ "field":"location",
115
+ "method":"find",
116
+ "path":"h3#location"
117
+ }
118
+ ]
119
+ }
120
+ },
121
+ {
122
+ "name":"weworkremotely",
123
+ "base_url":"https://weworkremotely.com",
124
+ "summary":{
125
+ "url":"/categories/2/jobs",
126
+ "no_pagination?":"true",
127
+ "pagination_fmt":"",
128
+ "pagination_start":"",
129
+ "pagination_scale":"",
130
+ "params":[
131
+ ],
132
+ "loop":"section.jobs ul li",
133
+ "fields":[
134
+ {
135
+ "field":"position",
136
+ "method":"find",
137
+ "path":"span.title"
138
+ },
139
+ {
140
+ "field":"company",
141
+ "method":"find",
142
+ "path":"span.company"
143
+ },
144
+ {
145
+ "field":"url",
146
+ "method":"find",
147
+ "path":"a",
148
+ "attr":"href"
149
+ },
150
+ {
151
+ "field":"posting_date",
152
+ "method":"find",
153
+ "path":"span.date"
154
+ }
155
+ ]
156
+ },
157
+ "sub_page":{
158
+ "fields":[
159
+ {
160
+ "field":"location",
161
+ "method":"find",
162
+ "path":"span.location"
163
+ },
164
+ {
165
+ "field":"description",
166
+ "method":"all",
167
+ "path":"div.listing-container div",
168
+ "loop_collect":"text",
169
+ "join":"\n"
170
+ }
171
+ ]
172
+ }
173
+ },
174
+ {
175
+ "name":"indeed",
176
+ "skip":"true",
177
+ "base_url":"http://www.indeed.com",
178
+ "summary":{
179
+ "url":"/jobs?q=SEARCHTERM&sr=directhire",
180
+ "pagination_fmt":"&start=",
181
+ "pagination_start":"0",
182
+ "pagination_scale":"10",
183
+ "params":[
184
+ {
185
+ "SEARCHTERM":[
186
+ "ruby",
187
+ "ruby+on+rails",
188
+ "junior+web+developer",
189
+ "ember.js",
190
+ "full+stack"
191
+ ]
192
+ }
193
+ ],
194
+ "loop":"div.row.result",
195
+ "fields":[
196
+ {
197
+ "field":"position",
198
+ "method":"find",
199
+ "path":"h2.jobtitle a"
200
+ },
201
+ {
202
+ "field":"url",
203
+ "method":"find",
204
+ "path":"h2.jobtitle a",
205
+ "attr":"href"
206
+ },
207
+ {
208
+ "field":"company",
209
+ "method":"find",
210
+ "path":"span.company span"
211
+ },
212
+ {
213
+ "field":"location",
214
+ "method":"find",
215
+ "path":"span.location span"
216
+ },
217
+ {
218
+ "field":"description",
219
+ "method":"find",
220
+ "path":"span.summary span"
221
+ },
222
+ {
223
+ "field":"posting_date",
224
+ "method":"find",
225
+ "path":"span.date"
226
+ }
227
+ ]
228
+ },
229
+ "sub_page":{
230
+ "fields":[
231
+ ]
232
+ }
233
+ },
234
+ {
235
+ "name":"linkedin",
236
+ "skip":"true",
237
+ "base_url":"https://www.linkedin.com",
238
+ "summary":{
239
+ "url":"/vsearch/j?keywords=SEARCHTERM&openAdvancedForm=true&locationType=I&countryCode=us&rsid=754744171429892349899&orig=FCTD&openFacets=L,C,TP&f_TP=1&pt=jobs&pt=jobs",
240
+ "pagination_fmt":"&page_num=",
241
+ "pagination_start":"1",
242
+ "pagination_scale":"1",
243
+ "params":[
244
+ {
245
+ "SEARCHTERM":[
246
+ "Ruby",
247
+ "Ruby+On+Rails",
248
+ "javascript"
249
+ ]
250
+ }
251
+ ],
252
+ "loop":"ol.search-results li.result",
253
+ "fields":[
254
+ {
255
+ "field":"position",
256
+ "method":"find",
257
+ "path":"a.title"
258
+ },
259
+ {
260
+ "field":"url",
261
+ "method":"find",
262
+ "path":"a.title",
263
+ "attr":"href"
264
+ },
265
+ {
266
+ "field":"company",
267
+ "method":"find",
268
+ "path":"div.description a"
269
+ },
270
+ {
271
+ "field":"location",
272
+ "method":"find",
273
+ "path":"dl.demographic bdi"
274
+ }
275
+ ]
276
+ },
277
+ "sub_page":{
278
+ "fields":[
279
+ {
280
+ "field":"description",
281
+ "method":"find",
282
+ "path":"div.description-section div.rich-text"
283
+ }
284
+ ]
285
+ }
286
+ }
287
+ ]
data/lib/rubyscraper.rb CHANGED
@@ -5,76 +5,135 @@ require 'rubyscraper/version'
5
5
 
6
6
  class RubyScraper
7
7
  include Capybara::DSL
8
+ attr_reader :scrape_config, :pages, :jobs, :posted_jobs, :endpoint, :scraped_jobs
8
9
 
9
- def initialize(endpoint)
10
+ def initialize(endpoint, pages=1)
10
11
  Capybara.register_driver :poltergeist do |app|
11
12
  Capybara::Poltergeist::Driver.new(app, js_errors: false)
12
13
  end
13
14
  Capybara.default_driver = :poltergeist
15
+
14
16
  @jobs = []
17
+ @scraped_jobs = 0
15
18
  @posted_jobs = 0
19
+ @pages = pages
16
20
  @endpoint = endpoint
17
- @search_terms_file = File.expand_path('../assets/search-terms.txt', __FILE__)
18
- @search_terms = []
19
- File.foreach(@search_terms_file) { |x| @search_terms << x.strip }
21
+ @scrape_file = File.expand_path('../assets/scrapes.json', __FILE__)
22
+ @scrape_config = JSON.parse(File.read(@scrape_file))
20
23
  end
21
24
 
22
- def scrape
23
- get_summaries
24
- get_bodies
25
- send_to_server
26
- return @jobs.length, @posted_jobs
25
+ def scrape(single_site=nil)
26
+ if single_site
27
+ search_site = scrape_config.select { |site| site["name"] == single_site }
28
+ if search_site
29
+ get_data(search_site.first)
30
+ else
31
+ raise "Invalid single site name #{single_site}. Not in scrape file."
32
+ end
33
+ else
34
+ scrape_config.each do |site|
35
+ unless site["skip"] == "true"
36
+ get_data(site)
37
+ end
38
+ end
39
+ end
40
+ return scraped_jobs, posted_jobs
27
41
  end
28
42
 
29
- def get_summaries
30
- @search_terms.each do |term|
31
- visit "http://careers.stackoverflow.com/jobs?searchTerm=#{term}&sort=p"
32
- (1..2).to_a.each do |page|
33
- visit "http://careers.stackoverflow.com/jobs?searchTerm=ruby&sort=p&pg=#{page}"
34
- all(".listResults .-item").each do |listing|
35
- position = listing.find("h3.-title a").text
36
- url = listing.find("h3.-title a")["href"]
37
- posting_date = listing.first("p._muted").text
43
+ def get_data(site)
44
+ get_summaries(site)
45
+ get_bodies(site)
46
+ send_to_server
47
+ end
38
48
 
39
- @jobs << { position: position, url: url, posting_date: posting_date }
49
+ def get_summaries(site)
50
+ if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"]
51
+ site["summary"]["params"][0]["SEARCHTERM"].each do |term|
52
+ summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}"
53
+ pagination_start = site["summary"]["pagination_start"].to_i
54
+ pagination_end = pagination_start + pages - 1
55
+ (pagination_start..pagination_end).to_a.each do |page|
56
+ visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}"
57
+ all(site["summary"]["loop"]).each do |listing|
58
+ job = pull_summary_data(site, listing)
59
+ job = modify_data(site, job)
60
+ jobs << job
61
+ end
62
+ puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries."
40
63
  end
41
64
  end
42
- puts "Pulled #{term} job summaries."
65
+ else
66
+ summary_url = "#{site["base_url"]}#{site["summary"]["url"]}"
67
+ visit summary_url
68
+ all(site["summary"]["loop"]).each do |listing|
69
+ job = pull_summary_data(site, listing)
70
+ job = modify_data(site, job)
71
+ jobs << job
72
+ end
73
+ puts "Pulled #{site["name"]} job summaries."
43
74
  end
44
75
  end
45
76
 
46
- def get_bodies
47
- @jobs.each_with_index do |job, i|
48
- puts "Job #{i+1} pulled."
49
- sleep 1
50
- visit "http://careers.stackoverflow.com#{job[:url]}"
51
- if has_css?("a.employer")
52
- job[:company] = find("a.employer").text
53
- end
54
- if has_css?("span.location")
55
- job[:location] = find("span.location").text
56
- end
57
- #job[:description] = first("div.description p")
58
- description = all("div.description p").map do |p|
59
- p.text
77
+ def pull_summary_data(site, listing)
78
+ job = Hash.new
79
+ site["summary"]["fields"].each do |field|
80
+ if field["attr"]
81
+ if listing.has_css?(field["path"])
82
+ job[field["field"]] =
83
+ listing.send(field["method"].to_sym, field["path"])[field["attr"]]
84
+ end
85
+ else
86
+ if listing.has_css?(field["path"])
87
+ job[field["field"]] =
88
+ listing.send(field["method"].to_sym, field["path"]).text
89
+ end
60
90
  end
61
- job[:description] = description.join("\n")
62
- tags = all("div.tags a.post-tag").map do |tag|
63
- tag.text
91
+ end; job
92
+ end
93
+
94
+ def modify_data(site, job)
95
+ job["url"] = "#{site["base_url"]}#{job["url"]}" unless job["url"].match(/^http/)
96
+ job
97
+ end
98
+
99
+ def get_bodies(site)
100
+ jobs.each_with_index do |job, i|
101
+ sleep 1
102
+ pull_job_data(site, job)
103
+ puts "Job #{i+1} pulled."
104
+ end
105
+ end
106
+
107
+ def pull_job_data(site, job)
108
+ visit job["url"]
109
+ site["sub_page"]["fields"].each do |field|
110
+ if field["method"] == "all"
111
+ if has_css?(field["path"])
112
+ values = all(field["path"]).map do |elem|
113
+ elem.send(field["loop_collect"])
114
+ end
115
+ job[field["field"]] = values.join(field["join"])
116
+ end
117
+ else
118
+ if has_css?(field["path"])
119
+ job[field["field"]] =
120
+ send(field["method"].to_sym,field["path"]).text
121
+ end
64
122
  end
65
- job[:tags] = tags
66
123
  end
67
124
  end
68
125
 
69
126
  def send_to_server
70
- @jobs.each_with_index do |job, i|
127
+ @scraped_jobs += jobs.length
128
+ jobs.each do |job|
71
129
  new_job = {
72
- position: job[:position],
73
- location: job[:location],
74
- description: job[:description],
75
- source: "http://careers.stackoverflow.com#{job[:url]}"
130
+ position: job["position"],
131
+ location: job["location"],
132
+ description: job["description"],
133
+ source: job["url"]
76
134
  }
77
- RestClient.post(@endpoint, job: new_job){ |response, request, result, &block|
135
+
136
+ RestClient.post(endpoint, job: new_job){ |response, request, result, &block|
78
137
  case response.code
79
138
  when 201
80
139
  @posted_jobs += 1
@@ -86,5 +145,6 @@ class RubyScraper
86
145
  end
87
146
  }
88
147
  end
148
+ @jobs = []
89
149
  end
90
150
  end
data/lib/rubyscraper/binary.rb CHANGED
@@ -6,9 +6,10 @@ class RubyScraper
6
6
  outstream.puts "StackOverflow Job Scraper"
7
7
  outstream.puts "---------------------------------------------"
8
8
  outstream.puts "Started scraping..."
9
- endpoint = argv.first
9
+ endpoint = argv[0]
10
+ single_site = argv[1]
10
11
  outstream.puts "Sending post requests to #{endpoint}"
11
- jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape
12
+ jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape(single_site)
12
13
  outstream.puts "Scraped #{jobs_scraped} jobs, succesfully posted #{jobs_saved} jobs."
13
14
  outstream.puts "---------------------------------------------"
14
15
  outstream.puts "Completed!"
data/lib/rubyscraper/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class RubyScraper
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/rubyscraper.gemspec CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |s|
16
16
  s.add_dependency "capybara"
17
17
  s.add_dependency "poltergeist"
18
18
  s.add_dependency "rest-client"
19
+ s.add_dependency "slop"
19
20
 
20
21
  s.add_development_dependency "bundler", "~> 1.9"
21
22
  s.add_development_dependency "rake", "~> 10.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Owsiany
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-23 00:00:00.000000000 Z
11
+ date: 2015-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: slop
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: bundler
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -122,7 +136,7 @@ files:
122
136
  - bin/console
123
137
  - bin/rubyscraper
124
138
  - bin/setup
125
- - lib/assets/search-terms.txt
139
+ - lib/assets/scrapes.json
126
140
  - lib/rubyscraper.rb
127
141
  - lib/rubyscraper/binary.rb
128
142
  - lib/rubyscraper/version.rb
data/lib/assets/search-terms.txt DELETED
@@ -1,5 +0,0 @@
1
- ruby
2
- ruby+on+rails
3
- javascript
4
- junior
5
- full-stack