rubyscraper 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6f4cfba8b1442632b6c54f30886d254ff25fbbd3
4
- data.tar.gz: 4bc8431e8294d5900819d29735f69650e93448da
3
+ metadata.gz: e1b7f2d272df18fbf97f0da1c6d655077e86bf06
4
+ data.tar.gz: d7d09ff13581907abab68aafce6e15b0843bea58
5
5
  SHA512:
6
- metadata.gz: 73ff93065f3079602dfcc58e35e08686ce4fff2d693b7f80c63e8cd37f011826156731b7d8a42a84472232c4a0c5b18dcb8efde1e7a7ede2ded37c431d22f95f
7
- data.tar.gz: 04e30fe957b8d95a25cb0620539eca3b8956eb1c5638c8bd038aa6b461f4868555889b033201b350dc651f5faff7e2e38c2fac86ea98ce4bf13ac76de21834c5
6
+ metadata.gz: 1f06d2c6e6e91658a90d44fd3bc4ff55b8a2bf99116d016bf4d7afbe2cefc9b6f911f2b7a0bcfc31665aba3d4d5d962eb66e36be264eaf5a18434e39a461ea89
7
+ data.tar.gz: ad976858fee74b80497619622d76349a34ad586ace4aaa4657134d1fe7f38fabe82ec46d514693ebc7c7b75b3c704a58cb296deca4b3f516b6ab6a462c7e358d
data/Gemfile.lock CHANGED
@@ -16,10 +16,13 @@ GEM
16
16
  rack-test (>= 0.5.4)
17
17
  xpath (~> 2.0)
18
18
  cliver (0.3.2)
19
+ coderay (1.1.0)
20
+ diff-lcs (1.2.5)
19
21
  domain_name (0.5.24)
20
22
  unf (>= 0.0.5, < 1.0.0)
21
23
  http-cookie (1.0.2)
22
24
  domain_name (~> 0.5)
25
+ method_source (0.8.2)
23
26
  mime-types (2.4.3)
24
27
  mini_portile (0.6.2)
25
28
  multi_json (1.11.0)
@@ -31,6 +34,10 @@ GEM
31
34
  cliver (~> 0.3.1)
32
35
  multi_json (~> 1.0)
33
36
  websocket-driver (>= 0.2.0)
37
+ pry (0.10.1)
38
+ coderay (~> 1.1.0)
39
+ method_source (~> 0.8.1)
40
+ slop (~> 3.4)
34
41
  rack (1.6.0)
35
42
  rack-test (0.6.3)
36
43
  rack (>= 1.0)
@@ -39,6 +46,20 @@ GEM
39
46
  http-cookie (>= 1.0.2, < 2.0)
40
47
  mime-types (>= 1.16, < 3.0)
41
48
  netrc (~> 0.7)
49
+ rspec (3.2.0)
50
+ rspec-core (~> 3.2.0)
51
+ rspec-expectations (~> 3.2.0)
52
+ rspec-mocks (~> 3.2.0)
53
+ rspec-core (3.2.3)
54
+ rspec-support (~> 3.2.0)
55
+ rspec-expectations (3.2.1)
56
+ diff-lcs (>= 1.2.0, < 2.0)
57
+ rspec-support (~> 3.2.0)
58
+ rspec-mocks (3.2.1)
59
+ diff-lcs (>= 1.2.0, < 2.0)
60
+ rspec-support (~> 3.2.0)
61
+ rspec-support (3.2.2)
62
+ slop (3.6.0)
42
63
  unf (0.1.4)
43
64
  unf_ext
44
65
  unf_ext (0.0.7.1)
@@ -53,5 +74,7 @@ PLATFORMS
53
74
 
54
75
  DEPENDENCIES
55
76
  bundler (~> 1.9)
77
+ pry
56
78
  rake (~> 10.0)
79
+ rspec (~> 3.0)
57
80
  rubyscraper!
@@ -0,0 +1,287 @@
1
+ [
2
+ {
3
+ "name":"stackoverflow",
4
+ "base_url":"http://www.careers.stackoverflow.com",
5
+ "summary":{
6
+ "url":"/jobs?searchTerm=SEARCHTERM&sort=p",
7
+ "pagination_fmt":"&pg=",
8
+ "pagination_start":"1",
9
+ "pagination_scale":"1",
10
+ "params":[
11
+ {
12
+ "SEARCHTERM":[
13
+ "ruby",
14
+ "ruby+on+rails",
15
+ "javascript"
16
+ ]
17
+ }
18
+ ],
19
+ "loop":".listResults .-item",
20
+ "fields":[
21
+ {
22
+ "field":"position",
23
+ "method":"find",
24
+ "path":"h3.-title a"
25
+ },
26
+ {
27
+ "field":"url",
28
+ "method":"find",
29
+ "path":"h3.-title a",
30
+ "attr":"href"
31
+ },
32
+ {
33
+ "field":"posting_date",
34
+ "method":"first",
35
+ "path":"p._muted"
36
+ }
37
+ ]
38
+ },
39
+ "sub_page":{
40
+ "fields":[
41
+ {
42
+ "field":"company",
43
+ "method":"find",
44
+ "path":"a.employer"
45
+ },
46
+ {
47
+ "field":"location",
48
+ "method":"find",
49
+ "path":"span.location"
50
+ },
51
+ {
52
+ "field":"description",
53
+ "method":"all",
54
+ "path":"div.description p",
55
+ "loop_collect":"text",
56
+ "join":"\n"
57
+ },
58
+ {
59
+ "field":"tags",
60
+ "method":"all",
61
+ "path":"div.tags a.post-tag",
62
+ "loop_collect":"text",
63
+ "join":", "
64
+ }
65
+ ]
66
+ }
67
+ },
68
+ {
69
+ "name":"rubynow",
70
+ "base_url":"http://jobs.rubynow.com/",
71
+ "summary":{
72
+ "url":"",
73
+ "no_pagination?":"true",
74
+ "pagination_fmt":"",
75
+ "pagination_start":"",
76
+ "pagination_scale":"",
77
+ "params":[
78
+ ],
79
+ "loop":"ul.jobs li",
80
+ "fields":[
81
+ {
82
+ "field":"position",
83
+ "method":"find",
84
+ "path":"h3 a"
85
+ },
86
+ {
87
+ "field":"url",
88
+ "method":"find",
89
+ "path":"h3 a",
90
+ "attr":"href"
91
+ },
92
+ {
93
+ "field":"posting_date",
94
+ "method":"find",
95
+ "path":"span.date"
96
+ }
97
+ ]
98
+ },
99
+ "sub_page":{
100
+ "fields":[
101
+ {
102
+ "field":"description",
103
+ "method":"all",
104
+ "path":"div#info p",
105
+ "loop_collect":"text",
106
+ "join":"\n"
107
+ },
108
+ {
109
+ "field":"company",
110
+ "method":"find",
111
+ "path":"h2#headline a"
112
+ },
113
+ {
114
+ "field":"location",
115
+ "method":"find",
116
+ "path":"h3#location"
117
+ }
118
+ ]
119
+ }
120
+ },
121
+ {
122
+ "name":"weworkremotely",
123
+ "base_url":"https://weworkremotely.com",
124
+ "summary":{
125
+ "url":"/categories/2/jobs",
126
+ "no_pagination?":"true",
127
+ "pagination_fmt":"",
128
+ "pagination_start":"",
129
+ "pagination_scale":"",
130
+ "params":[
131
+ ],
132
+ "loop":"section.jobs ul li",
133
+ "fields":[
134
+ {
135
+ "field":"position",
136
+ "method":"find",
137
+ "path":"span.title"
138
+ },
139
+ {
140
+ "field":"company",
141
+ "method":"find",
142
+ "path":"span.company"
143
+ },
144
+ {
145
+ "field":"url",
146
+ "method":"find",
147
+ "path":"a",
148
+ "attr":"href"
149
+ },
150
+ {
151
+ "field":"posting_date",
152
+ "method":"find",
153
+ "path":"span.date"
154
+ }
155
+ ]
156
+ },
157
+ "sub_page":{
158
+ "fields":[
159
+ {
160
+ "field":"location",
161
+ "method":"find",
162
+ "path":"span.location"
163
+ },
164
+ {
165
+ "field":"description",
166
+ "method":"all",
167
+ "path":"div.listing-container div",
168
+ "loop_collect":"text",
169
+ "join":"\n"
170
+ }
171
+ ]
172
+ }
173
+ },
174
+ {
175
+ "name":"indeed",
176
+ "skip":"true",
177
+ "base_url":"http://www.indeed.com",
178
+ "summary":{
179
+ "url":"/jobs?q=SEARCHTERM&sr=directhire",
180
+ "pagination_fmt":"&start=",
181
+ "pagination_start":"0",
182
+ "pagination_scale":"10",
183
+ "params":[
184
+ {
185
+ "SEARCHTERM":[
186
+ "ruby",
187
+ "ruby+on+rails",
188
+ "junior+web+developer",
189
+ "ember.js",
190
+ "full+stack"
191
+ ]
192
+ }
193
+ ],
194
+ "loop":"div.row.result",
195
+ "fields":[
196
+ {
197
+ "field":"position",
198
+ "method":"find",
199
+ "path":"h2.jobtitle a"
200
+ },
201
+ {
202
+ "field":"url",
203
+ "method":"find",
204
+ "path":"h2.jobtitle a",
205
+ "attr":"href"
206
+ },
207
+ {
208
+ "field":"company",
209
+ "method":"find",
210
+ "path":"span.company span"
211
+ },
212
+ {
213
+ "field":"location",
214
+ "method":"find",
215
+ "path":"span.location span"
216
+ },
217
+ {
218
+ "field":"description",
219
+ "method":"find",
220
+ "path":"span.summary span"
221
+ },
222
+ {
223
+ "field":"posting_date",
224
+ "method":"find",
225
+ "path":"span.date"
226
+ }
227
+ ]
228
+ },
229
+ "sub_page":{
230
+ "fields":[
231
+ ]
232
+ }
233
+ },
234
+ {
235
+ "name":"linkedin",
236
+ "skip":"true",
237
+ "base_url":"https://www.linkedin.com",
238
+ "summary":{
239
+ "url":"/vsearch/j?keywords=SEARCHTERM&openAdvancedForm=true&locationType=I&countryCode=us&rsid=754744171429892349899&orig=FCTD&openFacets=L,C,TP&f_TP=1&pt=jobs&pt=jobs",
240
+ "pagination_fmt":"&page_num=",
241
+ "pagination_start":"1",
242
+ "pagination_scale":"1",
243
+ "params":[
244
+ {
245
+ "SEARCHTERM":[
246
+ "Ruby",
247
+ "Ruby+On+Rails",
248
+ "javascript"
249
+ ]
250
+ }
251
+ ],
252
+ "loop":"ol.search-results li.result",
253
+ "fields":[
254
+ {
255
+ "field":"position",
256
+ "method":"find",
257
+ "path":"a.title"
258
+ },
259
+ {
260
+ "field":"url",
261
+ "method":"find",
262
+ "path":"a.title",
263
+ "attr":"href"
264
+ },
265
+ {
266
+ "field":"company",
267
+ "method":"find",
268
+ "path":"div.description a"
269
+ },
270
+ {
271
+ "field":"location",
272
+ "method":"find",
273
+ "path":"dl.demographic bdi"
274
+ }
275
+ ]
276
+ },
277
+ "sub_page":{
278
+ "fields":[
279
+ {
280
+ "field":"description",
281
+ "method":"find",
282
+ "path":"div.description-section div.rich-text"
283
+ }
284
+ ]
285
+ }
286
+ }
287
+ ]
data/lib/rubyscraper.rb CHANGED
@@ -5,76 +5,135 @@ require 'rubyscraper/version'
5
5
 
6
6
  class RubyScraper
7
7
  include Capybara::DSL
8
+ attr_reader :scrape_config, :pages, :jobs, :posted_jobs, :endpoint, :scraped_jobs
8
9
 
9
- def initialize(endpoint)
10
+ def initialize(endpoint, pages=1)
10
11
  Capybara.register_driver :poltergeist do |app|
11
12
  Capybara::Poltergeist::Driver.new(app, js_errors: false)
12
13
  end
13
14
  Capybara.default_driver = :poltergeist
15
+
14
16
  @jobs = []
17
+ @scraped_jobs = 0
15
18
  @posted_jobs = 0
19
+ @pages = pages
16
20
  @endpoint = endpoint
17
- @search_terms_file = File.expand_path('../assets/search-terms.txt', __FILE__)
18
- @search_terms = []
19
- File.foreach(@search_terms_file) { |x| @search_terms << x.strip }
21
+ @scrape_file = File.expand_path('../assets/scrapes.json', __FILE__)
22
+ @scrape_config = JSON.parse(File.read(@scrape_file))
20
23
  end
21
24
 
22
- def scrape
23
- get_summaries
24
- get_bodies
25
- send_to_server
26
- return @jobs.length, @posted_jobs
25
+ def scrape(single_site=nil)
26
+ if single_site
27
+ search_site = scrape_config.select { |site| site["name"] == single_site }
28
+ if search_site
29
+ get_data(search_site.first)
30
+ else
31
+ raise "Invalid single site name #{single_site}. Not in scrape file."
32
+ end
33
+ else
34
+ scrape_config.each do |site|
35
+ unless site["skip"] == "true"
36
+ get_data(site)
37
+ end
38
+ end
39
+ end
40
+ return scraped_jobs, posted_jobs
27
41
  end
28
42
 
29
- def get_summaries
30
- @search_terms.each do |term|
31
- visit "http://careers.stackoverflow.com/jobs?searchTerm=#{term}&sort=p"
32
- (1..2).to_a.each do |page|
33
- visit "http://careers.stackoverflow.com/jobs?searchTerm=ruby&sort=p&pg=#{page}"
34
- all(".listResults .-item").each do |listing|
35
- position = listing.find("h3.-title a").text
36
- url = listing.find("h3.-title a")["href"]
37
- posting_date = listing.first("p._muted").text
43
+ def get_data(site)
44
+ get_summaries(site)
45
+ get_bodies(site)
46
+ send_to_server
47
+ end
38
48
 
39
- @jobs << { position: position, url: url, posting_date: posting_date }
49
+ def get_summaries(site)
50
+ if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"]
51
+ site["summary"]["params"][0]["SEARCHTERM"].each do |term|
52
+ summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}"
53
+ pagination_start = site["summary"]["pagination_start"].to_i
54
+ pagination_end = pagination_start + pages - 1
55
+ (pagination_start..pagination_end).to_a.each do |page|
56
+ visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}"
57
+ all(site["summary"]["loop"]).each do |listing|
58
+ job = pull_summary_data(site, listing)
59
+ job = modify_data(site, job)
60
+ jobs << job
61
+ end
62
+ puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries."
40
63
  end
41
64
  end
42
- puts "Pulled #{term} job summaries."
65
+ else
66
+ summary_url = "#{site["base_url"]}#{site["summary"]["url"]}"
67
+ visit summary_url
68
+ all(site["summary"]["loop"]).each do |listing|
69
+ job = pull_summary_data(site, listing)
70
+ job = modify_data(site, job)
71
+ jobs << job
72
+ end
73
+ puts "Pulled #{site["name"]} job summaries."
43
74
  end
44
75
  end
45
76
 
46
- def get_bodies
47
- @jobs.each_with_index do |job, i|
48
- puts "Job #{i+1} pulled."
49
- sleep 1
50
- visit "http://careers.stackoverflow.com#{job[:url]}"
51
- if has_css?("a.employer")
52
- job[:company] = find("a.employer").text
53
- end
54
- if has_css?("span.location")
55
- job[:location] = find("span.location").text
56
- end
57
- #job[:description] = first("div.description p")
58
- description = all("div.description p").map do |p|
59
- p.text
77
+ def pull_summary_data(site, listing)
78
+ job = Hash.new
79
+ site["summary"]["fields"].each do |field|
80
+ if field["attr"]
81
+ if listing.has_css?(field["path"])
82
+ job[field["field"]] =
83
+ listing.send(field["method"].to_sym, field["path"])[field["attr"]]
84
+ end
85
+ else
86
+ if listing.has_css?(field["path"])
87
+ job[field["field"]] =
88
+ listing.send(field["method"].to_sym, field["path"]).text
89
+ end
60
90
  end
61
- job[:description] = description.join("\n")
62
- tags = all("div.tags a.post-tag").map do |tag|
63
- tag.text
91
+ end; job
92
+ end
93
+
94
+ def modify_data(site, job)
95
+ job["url"] = "#{site["base_url"]}#{job["url"]}" unless job["url"].match(/^http/)
96
+ job
97
+ end
98
+
99
+ def get_bodies(site)
100
+ jobs.each_with_index do |job, i|
101
+ sleep 1
102
+ pull_job_data(site, job)
103
+ puts "Job #{i+1} pulled."
104
+ end
105
+ end
106
+
107
+ def pull_job_data(site, job)
108
+ visit job["url"]
109
+ site["sub_page"]["fields"].each do |field|
110
+ if field["method"] == "all"
111
+ if has_css?(field["path"])
112
+ values = all(field["path"]).map do |elem|
113
+ elem.send(field["loop_collect"])
114
+ end
115
+ job[field["field"]] = values.join(field["join"])
116
+ end
117
+ else
118
+ if has_css?(field["path"])
119
+ job[field["field"]] =
120
+ send(field["method"].to_sym,field["path"]).text
121
+ end
64
122
  end
65
- job[:tags] = tags
66
123
  end
67
124
  end
68
125
 
69
126
  def send_to_server
70
- @jobs.each_with_index do |job, i|
127
+ @scraped_jobs += jobs.length
128
+ jobs.each do |job|
71
129
  new_job = {
72
- position: job[:position],
73
- location: job[:location],
74
- description: job[:description],
75
- source: "http://careers.stackoverflow.com#{job[:url]}"
130
+ position: job["position"],
131
+ location: job["location"],
132
+ description: job["description"],
133
+ source: job["url"]
76
134
  }
77
- RestClient.post(@endpoint, job: new_job){ |response, request, result, &block|
135
+
136
+ RestClient.post(endpoint, job: new_job){ |response, request, result, &block|
78
137
  case response.code
79
138
  when 201
80
139
  @posted_jobs += 1
@@ -86,5 +145,6 @@ class RubyScraper
86
145
  end
87
146
  }
88
147
  end
148
+ @jobs = []
89
149
  end
90
150
  end
@@ -6,9 +6,10 @@ class RubyScraper
6
6
  outstream.puts "StackOverflow Job Scraper"
7
7
  outstream.puts "---------------------------------------------"
8
8
  outstream.puts "Started scraping..."
9
- endpoint = argv.first
9
+ endpoint = argv[0]
10
+ single_site = argv[1]
10
11
  outstream.puts "Sending post requests to #{endpoint}"
11
- jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape
12
+ jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape(single_site)
12
13
  outstream.puts "Scraped #{jobs_scraped} jobs, succesfully posted #{jobs_saved} jobs."
13
14
  outstream.puts "---------------------------------------------"
14
15
  outstream.puts "Completed!"
@@ -1,3 +1,3 @@
1
1
  class RubyScraper
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/rubyscraper.gemspec CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |s|
16
16
  s.add_dependency "capybara"
17
17
  s.add_dependency "poltergeist"
18
18
  s.add_dependency "rest-client"
19
+ s.add_dependency "slop"
19
20
 
20
21
  s.add_development_dependency "bundler", "~> 1.9"
21
22
  s.add_development_dependency "rake", "~> 10.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Owsiany
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-23 00:00:00.000000000 Z
11
+ date: 2015-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: slop
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: bundler
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -122,7 +136,7 @@ files:
122
136
  - bin/console
123
137
  - bin/rubyscraper
124
138
  - bin/setup
125
- - lib/assets/search-terms.txt
139
+ - lib/assets/scrapes.json
126
140
  - lib/rubyscraper.rb
127
141
  - lib/rubyscraper/binary.rb
128
142
  - lib/rubyscraper/version.rb
@@ -1,5 +0,0 @@
1
- ruby
2
- ruby+on+rails
3
- javascript
4
- junior
5
- full-stack