jobdrop 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. jobdrop-2.0.0/LICENSE +21 -0
  2. jobdrop-2.0.0/PKG-INFO +210 -0
  3. jobdrop-2.0.0/README.md +177 -0
  4. jobdrop-2.0.0/jobdrop/__init__.py +256 -0
  5. jobdrop-2.0.0/jobdrop/_defaults.py +64 -0
  6. jobdrop-2.0.0/jobdrop/adzuna/__init__.py +238 -0
  7. jobdrop-2.0.0/jobdrop/bayt/__init__.py +145 -0
  8. jobdrop-2.0.0/jobdrop/bdjobs/__init__.py +353 -0
  9. jobdrop-2.0.0/jobdrop/bdjobs/constant.py +32 -0
  10. jobdrop-2.0.0/jobdrop/bdjobs/util.py +100 -0
  11. jobdrop-2.0.0/jobdrop/clearancejobs/__init__.py +426 -0
  12. jobdrop-2.0.0/jobdrop/collabwork/__init__.py +402 -0
  13. jobdrop-2.0.0/jobdrop/exception.py +45 -0
  14. jobdrop-2.0.0/jobdrop/findwork/__init__.py +163 -0
  15. jobdrop-2.0.0/jobdrop/glassdoor/__init__.py +365 -0
  16. jobdrop-2.0.0/jobdrop/glassdoor/constant.py +184 -0
  17. jobdrop-2.0.0/jobdrop/glassdoor/util.py +42 -0
  18. jobdrop-2.0.0/jobdrop/google/__init__.py +491 -0
  19. jobdrop-2.0.0/jobdrop/google/constant.py +52 -0
  20. jobdrop-2.0.0/jobdrop/google/util.py +41 -0
  21. jobdrop-2.0.0/jobdrop/greenhouse/__init__.py +498 -0
  22. jobdrop-2.0.0/jobdrop/hiring_cafe/__init__.py +396 -0
  23. jobdrop-2.0.0/jobdrop/hiring_cafe/util.py +3 -0
  24. jobdrop-2.0.0/jobdrop/indeed/__init__.py +306 -0
  25. jobdrop-2.0.0/jobdrop/indeed/constant.py +109 -0
  26. jobdrop-2.0.0/jobdrop/indeed/util.py +83 -0
  27. jobdrop-2.0.0/jobdrop/insightglobal/__init__.py +287 -0
  28. jobdrop-2.0.0/jobdrop/jooble/__init__.py +214 -0
  29. jobdrop-2.0.0/jobdrop/kforce/__init__.py +303 -0
  30. jobdrop-2.0.0/jobdrop/linkedin/__init__.py +427 -0
  31. jobdrop-2.0.0/jobdrop/linkedin/constant.py +8 -0
  32. jobdrop-2.0.0/jobdrop/linkedin/util.py +242 -0
  33. jobdrop-2.0.0/jobdrop/model.py +348 -0
  34. jobdrop-2.0.0/jobdrop/naukri/__init__.py +304 -0
  35. jobdrop-2.0.0/jobdrop/naukri/constant.py +11 -0
  36. jobdrop-2.0.0/jobdrop/naukri/util.py +38 -0
  37. jobdrop-2.0.0/jobdrop/the_muse/__init__.py +177 -0
  38. jobdrop-2.0.0/jobdrop/usajobs/__init__.py +203 -0
  39. jobdrop-2.0.0/jobdrop/util.py +363 -0
  40. jobdrop-2.0.0/jobdrop/wellfound/__init__.py +453 -0
  41. jobdrop-2.0.0/jobdrop/wellfound/util.py +3 -0
  42. jobdrop-2.0.0/jobdrop/ziprecruiter/__init__.py +285 -0
  43. jobdrop-2.0.0/jobdrop/ziprecruiter/constant.py +29 -0
  44. jobdrop-2.0.0/jobdrop/ziprecruiter/util.py +31 -0
  45. jobdrop-2.0.0/jobdrop_mcp_server/__init__.py +16 -0
  46. jobdrop-2.0.0/jobdrop_mcp_server/server.py +404 -0
  47. jobdrop-2.0.0/pyproject.toml +53 -0
jobdrop-2.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Cullen Watson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
jobdrop-2.0.0/PKG-INFO ADDED
@@ -0,0 +1,210 @@
1
+ Metadata-Version: 2.4
2
+ Name: jobdrop
3
+ Version: 2.0.0
4
+ Summary: 20-source job scraper library + MCP server. LinkedIn, Indeed, Glassdoor, Google, ZipRecruiter, Wellfound, Hiring Cafe, Greenhouse, USAJobs, Adzuna, Jooble, Findwork, The Muse, Insight Global, Clearance Jobs, Kforce, CollabWork, Naukri, Bayt, BDJobs.
5
+ License-File: LICENSE
6
+ Keywords: jobs-scraper,mcp,linkedin,indeed,glassdoor,ziprecruiter,wellfound,hiring-cafe,greenhouse,usajobs
7
+ Author: kbwhodat
8
+ Author-email: katobyan@gmail.com
9
+ Requires-Python: >=3.10,<4.0
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Provides-Extra: mcp
17
+ Requires-Dist: beautifulsoup4 (>=4.12.2,<5.0.0)
18
+ Requires-Dist: camoufox (>=0.4.11,<0.5.0)
19
+ Requires-Dist: curl-cffi (>=0.7.0,<0.8.0)
20
+ Requires-Dist: fastmcp (>=0.2.0) ; extra == "mcp"
21
+ Requires-Dist: markdownify (>=1.1.0,<2.0.0)
22
+ Requires-Dist: mcp (>=1.1.0) ; extra == "mcp"
23
+ Requires-Dist: numpy (>=1.26.0)
24
+ Requires-Dist: pandas (>=2.1.0,<3.0.0)
25
+ Requires-Dist: pydantic (>=2.3.0,<3.0.0)
26
+ Requires-Dist: regex (>=2024.4.28,<2025.0.0)
27
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
28
+ Requires-Dist: selenium-driverless (>=1.9.4,<2.0.0)
29
+ Requires-Dist: tls-client (>=1.0.1,<2.0.0)
30
+ Project-URL: Homepage, https://github.com/kbwhodat/jobdrop
31
+ Description-Content-Type: text/markdown
32
+
33
+ # jobdrop
34
+
35
+ A multi-source job scraper. Hits 20 job boards in one call, normalizes
36
+ the results into a pandas DataFrame, and ships with anti-bot bypasses
37
+ for the boards that block standard scrapers.
38
+
39
+ ## What's in here
40
+
41
+ ### Sources
42
+
43
+ | `site_name` | Source | Mechanism |
44
+ |---|---|---|
45
+ | `linkedin` | LinkedIn | Public listing scrape with optional detail-page enrichment |
46
+ | `indeed` | Indeed | GraphQL with the `Int!` radius fix + per-company cap + paginate-until-quota |
47
+ | `glassdoor` | Glassdoor | selenium-driverless headless to defeat Cloudflare 403; in-page GraphQL fetch |
48
+ | `google` | Google Jobs | selenium-driverless headless against `udm=8`; SERP DOM walk |
49
+ | `zip_recruiter` | ZipRecruiter | `curl_cffi` + `safari17_2_ios` TLS impersonation against the web HTML endpoint |
50
+ | `bayt` | Bayt | Public scrape |
51
+ | `naukri` | Naukri | Public scrape |
52
+ | `bdjobs` | BDJobs | Public scrape |
53
+ | `usajobs` | USAJobs.gov | Federal public API |
54
+ | `adzuna` | Adzuna | Public API |
55
+ | `jooble` | Jooble | Public API |
56
+ | `findwork` | Findwork.dev | Public API |
57
+ | `the_muse` | The Muse | Public API |
58
+ | `insight_global` | Insight Global staffing | Server-rendered HTML scrape with hidden JSON blob per result |
59
+ | `clearance_jobs` | ClearanceJobs (DHI) | Public JSON API + parallel detail-page enrichment for full JD, salary, type, remote bool |
60
+ | `kforce` | Kforce staffing | Direct Azure Cognitive Search calls (bypasses Imperva on the public host) |
61
+ | `greenhouse` | Greenhouse-hosted boards | Google `site:` dorks via selenium-driverless → public Greenhouse API; 3-layer staleness filter |
62
+
63
+ ### Anti-bot solved
64
+
65
+ - **Google** — selenium-driverless cold-start headless. Defeats the 2026 CAPTCHA wall that takes out Playwright / undetected-chromedriver / nodriver / patchright.
66
+ - **Glassdoor** — selenium-driverless rewrite to bypass Cloudflare 403; URL-encoded location, partial-GraphQL-error tolerance.
67
+ - **ZipRecruiter** — `curl_cffi` + `safari17_2_ios` against the web HTML endpoint. The iOS-app API is dead behind Cloudflare.
68
+ - **Kforce** — bypasses Imperva on the public host by calling the Azure Cognitive Search backend directly.
69
+ - **Greenhouse** — uses the same selenium-driverless infrastructure as Google for `site:` dorks across all greenhouse-hosted boards.
70
+
71
+ ### Other tightening
72
+
73
+ - **LinkedIn** — salary extraction from description body, optional per-company cap, parallel detail fetches.
74
+ - **Indeed** — fixed `radius=25` default after Indeed promoted the GraphQL field to `Int!`; per-company cap to surface diverse employers; pagination loop fixed.
75
+ - **ClearanceJobs** — search API gives a 200-char preview; this fork parallel-fetches `/api/v1/jobs/{id}` so you get the full JD, salary range, structured `job_type`, and authoritative `remote` bool.
76
+ - **Greenhouse** — three layers of stale-protection (404 drop / past `application_deadline` / `first_published` age with 90-day default that respects `hours_old`).
77
+
78
+ ### Bundled credentials
79
+
80
+ API keys for USAJobs, Adzuna, Jooble, Findwork, and The Muse are baked
81
+ into a positional resolver (`jobdrop/_defaults.py`) so the new sources
82
+ work without environment setup. User-set env vars still win via
83
+ `setdefault` semantics.
84
+
85
+ ## Installation
86
+
87
+ ```
88
+ pip install -U jobdrop
89
+ ```
90
+
91
+ Python ≥ 3.10 required.
92
+
93
+ ## Usage
94
+
95
+ ```python
96
+ from jobdrop import scrape_jobs
97
+
98
+ jobs = scrape_jobs(
99
+ site_name=["insight_global", "clearance_jobs", "kforce", "greenhouse",
100
+ "linkedin", "indeed", "google"],
101
+ search_term="site reliability engineer",
102
+ location="Atlanta, GA",
103
+ results_wanted=20,
104
+ hours_old=720, # 30-day freshness cap
105
+ country_indeed="usa",
106
+ )
107
+ print(f"Found {len(jobs)} jobs")
108
+ print(jobs[["site", "title", "company", "location", "min_amount", "max_amount", "job_url"]].head())
109
+ ```
110
+
111
+ ## Parameters
112
+
113
+ ```
114
+ scrape_jobs(
115
+ site_name list[str] | str — any of the sources above (default: all)
116
+ search_term str — keyword query
117
+ google_search_term str — Google Jobs override (only filter for `google`)
118
+ location str — "City, ST" or ZIP. Each scraper geocodes its own way.
119
+ distance int — radius miles, default 50
120
+ is_remote bool — remote-only filter (where supported)
121
+ job_type str — "fulltime" | "parttime" | "contract" | "internship"
122
+ easy_apply bool — direct-board apply only (LinkedIn easy-apply is broken)
123
+ results_wanted int — per-site target
124
+ offset int — pagination offset
125
+ hours_old int — drop postings older than N hours
126
+ country_indeed str — Indeed/Glassdoor country (see list below)
127
+ description_format str — "markdown" | "html"
128
+ enforce_annual_salary bool — convert hourly/monthly to yearly
129
+ linkedin_fetch_description bool — full JD + direct URL (slower)
130
+ linkedin_company_ids list[int] — filter LinkedIn by company IDs
131
+ proxies list[str] — round-robin proxies, "user:pass@host:port"
132
+ ca_cert str — CA cert path for proxies
133
+ user_agent str — override the default UA
134
+ verbose int — 0 errors / 1 warnings / 2 all
135
+ )
136
+ ```
137
+
138
+ ### Per-scraper limitations
139
+
140
+ - **Indeed** — only one of `hours_old` / (`job_type`+`is_remote`) / `easy_apply` per call.
141
+ - **LinkedIn** — only one of `hours_old` / `easy_apply` per call.
142
+ - **ClearanceJobs** — location/remote filters require facet IDs from the dropdown endpoints (not implemented). Filter client-side or scope by keyword.
143
+ - **InsightGlobal** — does not expose client-company name (it's the staffing firm). `is_remote` is not available in their data.
144
+ - **Greenhouse** — Google indexes some postings after they're filled. Stale 404s are filtered out; the freshness cutoff filters "live but ancient" postings (default 90 days, override with `hours_old`).
145
+
146
+ ## JobPost schema
147
+
148
+ ```
149
+ JobPost
150
+ ├── id, title, company_name, company_url, job_url
151
+ ├── location { country, city, state }
152
+ ├── description
153
+ ├── is_remote
154
+ ├── date_posted
155
+ ├── job_type fulltime | parttime | contract | internship
156
+ ├── compensation
157
+ │ ├── interval yearly | monthly | weekly | daily | hourly
158
+ │ ├── min_amount, max_amount, currency
159
+ │ └── salary_source
160
+ ├── job_level (LinkedIn, ClearanceJobs)
161
+ ├── company_industry (LinkedIn, Indeed, Greenhouse, Kforce)
162
+ ├── company_country, company_addresses,
163
+ │ company_employees_label, company_revenue_label,
164
+ │ company_description, company_logo (Indeed)
165
+ ├── skills, experience_range,
166
+ │ company_rating, company_reviews_count,
167
+ │ vacancy_count, work_from_home_type (Naukri)
168
+ └── emails
169
+ ```
170
+
171
+ ## Indeed / Glassdoor country list
172
+
173
+ Pass `country_indeed` (use the exact name; `*` = also supported on Glassdoor):
174
+
175
+ | | | | |
176
+ |---|---|---|---|
177
+ | Argentina | Australia* | Austria* | Bahrain |
178
+ | Belgium* | Brazil* | Canada* | Chile |
179
+ | China | Colombia | Costa Rica | Czech Republic |
180
+ | Denmark | Ecuador | Egypt | Finland |
181
+ | France* | Germany* | Greece | Hong Kong* |
182
+ | Hungary | India* | Indonesia | Ireland* |
183
+ | Israel | Italy* | Japan | Kuwait |
184
+ | Luxembourg | Malaysia | Mexico* | Morocco |
185
+ | Netherlands* | New Zealand* | Nigeria | Norway |
186
+ | Oman | Pakistan | Panama | Peru |
187
+ | Philippines | Poland | Portugal | Qatar |
188
+ | Romania | Saudi Arabia | Singapore* | South Africa |
189
+ | South Korea | Spain* | Sweden | Switzerland* |
190
+ | Taiwan | Thailand | Turkey | Ukraine |
191
+ | United Arab Emirates | UK* | USA* | Uruguay |
192
+ | Venezuela | Vietnam* | | |
193
+
194
+ LinkedIn searches globally and uses only `location`. ZipRecruiter is US/Canada and uses only `location`. Bayt searches internationally with only `search_term`.
195
+
196
+ ## Notes
197
+
198
+ - Most boards cap a single search at ~1000 results.
199
+ - LinkedIn rate-limits aggressively around the 10th page of pagination on a single IP. Use `proxies`.
200
+ - For Indeed search-term tuning: it searches the description too. Use `-foo` to exclude, `"exact phrase"` for exact match. Example:
201
+ ```python
202
+ search_term='"site reliability engineer" (kubernetes OR terraform) -recruiter'
203
+ ```
204
+ - For Google: copy the exact filter syntax from a real Google Jobs search and pass it as `google_search_term`.
205
+ - For Greenhouse: keyword + location are passed straight to a Google `site:greenhouse.io` query, so Boolean operators and quotes work. Don't quote the full `"City, ST"` — quote the city alone, leave the state bare.
206
+
207
+ ## License
208
+
209
+ MIT. See `LICENSE`.
210
+
@@ -0,0 +1,177 @@
1
+ # jobdrop
2
+
3
+ A multi-source job scraper. Hits 20 job boards in one call, normalizes
4
+ the results into a pandas DataFrame, and ships with anti-bot bypasses
5
+ for the boards that block standard scrapers.
6
+
7
+ ## What's in here
8
+
9
+ ### Sources
10
+
11
+ | `site_name` | Source | Mechanism |
12
+ |---|---|---|
13
+ | `linkedin` | LinkedIn | Public listing scrape with optional detail-page enrichment |
14
+ | `indeed` | Indeed | GraphQL with the `Int!` radius fix + per-company cap + paginate-until-quota |
15
+ | `glassdoor` | Glassdoor | selenium-driverless headless to defeat Cloudflare 403; in-page GraphQL fetch |
16
+ | `google` | Google Jobs | selenium-driverless headless against `udm=8`; SERP DOM walk |
17
+ | `zip_recruiter` | ZipRecruiter | `curl_cffi` + `safari17_2_ios` TLS impersonation against the web HTML endpoint |
18
+ | `bayt` | Bayt | Public scrape |
19
+ | `naukri` | Naukri | Public scrape |
20
+ | `bdjobs` | BDJobs | Public scrape |
21
+ | `usajobs` | USAJobs.gov | Federal public API |
22
+ | `adzuna` | Adzuna | Public API |
23
+ | `jooble` | Jooble | Public API |
24
+ | `findwork` | Findwork.dev | Public API |
25
+ | `the_muse` | The Muse | Public API |
26
+ | `insight_global` | Insight Global staffing | Server-rendered HTML scrape with hidden JSON blob per result |
27
+ | `clearance_jobs` | ClearanceJobs (DHI) | Public JSON API + parallel detail-page enrichment for full JD, salary, type, remote bool |
28
+ | `kforce` | Kforce staffing | Direct Azure Cognitive Search calls (bypasses Imperva on the public host) |
29
+ | `greenhouse` | Greenhouse-hosted boards | Google `site:` dorks via selenium-driverless → public Greenhouse API; 3-layer staleness filter |
30
+
31
+ ### Anti-bot solved
32
+
33
+ - **Google** — selenium-driverless cold-start headless. Defeats the 2026 CAPTCHA wall that takes out Playwright / undetected-chromedriver / nodriver / patchright.
34
+ - **Glassdoor** — selenium-driverless rewrite to bypass Cloudflare 403; URL-encoded location, partial-GraphQL-error tolerance.
35
+ - **ZipRecruiter** — `curl_cffi` + `safari17_2_ios` against the web HTML endpoint. The iOS-app API is dead behind Cloudflare.
36
+ - **Kforce** — bypasses Imperva on the public host by calling the Azure Cognitive Search backend directly.
37
+ - **Greenhouse** — uses the same selenium-driverless infrastructure as Google for `site:` dorks across all greenhouse-hosted boards.
38
+
39
+ ### Other tightening
40
+
41
+ - **LinkedIn** — salary extraction from description body, optional per-company cap, parallel detail fetches.
42
+ - **Indeed** — fixed `radius=25` default after Indeed promoted the GraphQL field to `Int!`; per-company cap to surface diverse employers; pagination loop fixed.
43
+ - **ClearanceJobs** — search API gives a 200-char preview; this fork parallel-fetches `/api/v1/jobs/{id}` so you get the full JD, salary range, structured `job_type`, and authoritative `remote` bool.
44
+ - **Greenhouse** — three layers of stale-protection (404 drop / past `application_deadline` / `first_published` age with 90-day default that respects `hours_old`).
45
+
46
+ ### Bundled credentials
47
+
48
+ API keys for USAJobs, Adzuna, Jooble, Findwork, and The Muse are baked
49
+ into a positional resolver (`jobdrop/_defaults.py`) so the new sources
50
+ work without environment setup. User-set env vars still win via
51
+ `setdefault` semantics.
52
+
53
+ ## Installation
54
+
55
+ ```
56
+ pip install -U jobdrop
57
+ ```
58
+
59
+ Python ≥ 3.10 required.
60
+
61
+ ## Usage
62
+
63
+ ```python
64
+ from jobdrop import scrape_jobs
65
+
66
+ jobs = scrape_jobs(
67
+ site_name=["insight_global", "clearance_jobs", "kforce", "greenhouse",
68
+ "linkedin", "indeed", "google"],
69
+ search_term="site reliability engineer",
70
+ location="Atlanta, GA",
71
+ results_wanted=20,
72
+ hours_old=720, # 30-day freshness cap
73
+ country_indeed="usa",
74
+ )
75
+ print(f"Found {len(jobs)} jobs")
76
+ print(jobs[["site", "title", "company", "location", "min_amount", "max_amount", "job_url"]].head())
77
+ ```
78
+
79
+ ## Parameters
80
+
81
+ ```
82
+ scrape_jobs(
83
+ site_name list[str] | str — any of the sources above (default: all)
84
+ search_term str — keyword query
85
+ google_search_term str — Google Jobs override (only filter for `google`)
86
+ location str — "City, ST" or ZIP. Each scraper geocodes its own way.
87
+ distance int — radius miles, default 50
88
+ is_remote bool — remote-only filter (where supported)
89
+ job_type str — "fulltime" | "parttime" | "contract" | "internship"
90
+ easy_apply bool — direct-board apply only (LinkedIn easy-apply is broken)
91
+ results_wanted int — per-site target
92
+ offset int — pagination offset
93
+ hours_old int — drop postings older than N hours
94
+ country_indeed str — Indeed/Glassdoor country (see list below)
95
+ description_format str — "markdown" | "html"
96
+ enforce_annual_salary bool — convert hourly/monthly to yearly
97
+ linkedin_fetch_description bool — full JD + direct URL (slower)
98
+ linkedin_company_ids list[int] — filter LinkedIn by company IDs
99
+ proxies list[str] — round-robin proxies, "user:pass@host:port"
100
+ ca_cert str — CA cert path for proxies
101
+ user_agent str — override the default UA
102
+ verbose int — 0 errors / 1 warnings / 2 all
103
+ )
104
+ ```
105
+
106
+ ### Per-scraper limitations
107
+
108
+ - **Indeed** — only one of `hours_old` / (`job_type`+`is_remote`) / `easy_apply` per call.
109
+ - **LinkedIn** — only one of `hours_old` / `easy_apply` per call.
110
+ - **ClearanceJobs** — location/remote filters require facet IDs from the dropdown endpoints (not implemented). Filter client-side or scope by keyword.
111
+ - **InsightGlobal** — does not expose client-company name (it's the staffing firm). `is_remote` is not available in their data.
112
+ - **Greenhouse** — Google indexes some postings after they're filled. Stale 404s are filtered out; the freshness cutoff filters "live but ancient" postings (default 90 days, override with `hours_old`).
113
+
114
+ ## JobPost schema
115
+
116
+ ```
117
+ JobPost
118
+ ├── id, title, company_name, company_url, job_url
119
+ ├── location { country, city, state }
120
+ ├── description
121
+ ├── is_remote
122
+ ├── date_posted
123
+ ├── job_type fulltime | parttime | contract | internship
124
+ ├── compensation
125
+ │ ├── interval yearly | monthly | weekly | daily | hourly
126
+ │ ├── min_amount, max_amount, currency
127
+ │ └── salary_source
128
+ ├── job_level (LinkedIn, ClearanceJobs)
129
+ ├── company_industry (LinkedIn, Indeed, Greenhouse, Kforce)
130
+ ├── company_country, company_addresses,
131
+ │ company_employees_label, company_revenue_label,
132
+ │ company_description, company_logo (Indeed)
133
+ ├── skills, experience_range,
134
+ │ company_rating, company_reviews_count,
135
+ │ vacancy_count, work_from_home_type (Naukri)
136
+ └── emails
137
+ ```
138
+
139
+ ## Indeed / Glassdoor country list
140
+
141
+ Pass `country_indeed` (use the exact name; `*` = also supported on Glassdoor):
142
+
143
+ | | | | |
144
+ |---|---|---|---|
145
+ | Argentina | Australia* | Austria* | Bahrain |
146
+ | Belgium* | Brazil* | Canada* | Chile |
147
+ | China | Colombia | Costa Rica | Czech Republic |
148
+ | Denmark | Ecuador | Egypt | Finland |
149
+ | France* | Germany* | Greece | Hong Kong* |
150
+ | Hungary | India* | Indonesia | Ireland* |
151
+ | Israel | Italy* | Japan | Kuwait |
152
+ | Luxembourg | Malaysia | Mexico* | Morocco |
153
+ | Netherlands* | New Zealand* | Nigeria | Norway |
154
+ | Oman | Pakistan | Panama | Peru |
155
+ | Philippines | Poland | Portugal | Qatar |
156
+ | Romania | Saudi Arabia | Singapore* | South Africa |
157
+ | South Korea | Spain* | Sweden | Switzerland* |
158
+ | Taiwan | Thailand | Turkey | Ukraine |
159
+ | United Arab Emirates | UK* | USA* | Uruguay |
160
+ | Venezuela | Vietnam* | | |
161
+
162
+ LinkedIn searches globally and uses only `location`. ZipRecruiter is US/Canada and uses only `location`. Bayt searches internationally with only `search_term`.
163
+
164
+ ## Notes
165
+
166
+ - Most boards cap a single search at ~1000 results.
167
+ - LinkedIn rate-limits aggressively around the 10th page of pagination on a single IP. Use `proxies`.
168
+ - For Indeed search-term tuning: it searches the description too. Use `-foo` to exclude, `"exact phrase"` for exact match. Example:
169
+ ```python
170
+ search_term='"site reliability engineer" (kubernetes OR terraform) -recruiter'
171
+ ```
172
+ - For Google: copy the exact filter syntax from a real Google Jobs search and pass it as `google_search_term`.
173
+ - For Greenhouse: keyword + location are passed straight to a Google `site:greenhouse.io` query, so Boolean operators and quotes work. Don't quote the full `"City, ST"` — quote the city alone, leave the state bare.
174
+
175
+ ## License
176
+
177
+ MIT. See `LICENSE`.
@@ -0,0 +1,256 @@
1
+ from __future__ import annotations
2
+
3
+ # Apply compiled defaults BEFORE any scraper module runs its module-level
4
+ # os.environ reads. User-set env vars are preserved (setdefault semantics).
5
+ from jobdrop import _defaults # noqa: F401
6
+
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ from typing import Tuple
9
+
10
+ import pandas as pd
11
+
12
+ from jobdrop.adzuna import Adzuna
13
+ from jobdrop.bayt import BaytScraper
14
+ from jobdrop.bdjobs import BDJobs
15
+ from jobdrop.clearancejobs import ClearanceJobs
16
+ from jobdrop.collabwork import CollabWork
17
+ from jobdrop.findwork import Findwork
18
+ from jobdrop.glassdoor import Glassdoor
19
+ from jobdrop.google import Google
20
+ from jobdrop.greenhouse import Greenhouse
21
+ from jobdrop.hiring_cafe import HiringCafe
22
+ from jobdrop.indeed import Indeed
23
+ from jobdrop.insightglobal import InsightGlobal
24
+ from jobdrop.jooble import Jooble
25
+ from jobdrop.kforce import Kforce
26
+ from jobdrop.linkedin import LinkedIn
27
+ from jobdrop.naukri import Naukri
28
+ from jobdrop.the_muse import TheMuse
29
+ from jobdrop.usajobs import USAJobs
30
+ from jobdrop.wellfound import Wellfound
31
+ from jobdrop.model import JobType, Location, JobResponse, Country
32
+ from jobdrop.model import SalarySource, ScraperInput, Site
33
+ from jobdrop.util import (
34
+ set_logger_level,
35
+ extract_salary,
36
+ create_logger,
37
+ get_enum_from_value,
38
+ map_str_to_site,
39
+ convert_to_annual,
40
+ desired_order,
41
+ )
42
+ from jobdrop.ziprecruiter import ZipRecruiter
43
+
44
+
45
+ # Update the SCRAPER_MAPPING dictionary in the scrape_jobs function
46
+
47
def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
    google_search_term: str | None = None,
    location: str | None = None,
    distance: int | None = 50,
    is_remote: bool = False,
    job_type: str | None = None,
    easy_apply: bool | None = None,
    results_wanted: int = 15,
    country_indeed: str = "usa",
    proxies: list[str] | str | None = None,
    ca_cert: str | None = None,
    description_format: str = "markdown",
    linkedin_fetch_description: bool | None = False,
    linkedin_company_ids: list[int] | None = None,
    offset: int | None = 0,
    hours_old: int | None = None,
    enforce_annual_salary: bool = False,
    verbose: int = 0,
    user_agent: str | None = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Scrape job data from the requested job boards concurrently.

    :param site_name: one site, a list of sites, or None for every supported site
    :param search_term: keyword query passed to each scraper
    :param google_search_term: Google Jobs override query (only filter for `google`)
    :param location: "City, ST" or ZIP; each scraper geocodes its own way
    :param distance: search radius in miles
    :param is_remote: remote-only filter where the board supports it
    :param job_type: "fulltime" | "parttime" | "contract" | "internship"
    :param easy_apply: direct-board apply only
    :param results_wanted: per-site result target
    :param country_indeed: Indeed/Glassdoor country name
    :param proxies: round-robin proxies, "user:pass@host:port"
    :param ca_cert: CA cert path used with the proxies
    :param description_format: "markdown" | "html"
    :param linkedin_fetch_description: fetch full JD + direct URL (slower)
    :param linkedin_company_ids: restrict LinkedIn results to these company IDs
    :param offset: pagination offset
    :param hours_old: drop postings older than N hours (None = no cap)
    :param enforce_annual_salary: convert hourly/monthly pay to yearly
    :param verbose: 0 errors / 1 warnings / 2 all
    :param user_agent: override the default UA
    :return: pandas DataFrame with one row per job, normalized columns
    """
    SCRAPER_MAPPING = {
        Site.LINKEDIN: LinkedIn,
        Site.INDEED: Indeed,
        Site.ZIP_RECRUITER: ZipRecruiter,
        Site.GLASSDOOR: Glassdoor,
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
        Site.NAUKRI: Naukri,
        Site.BDJOBS: BDJobs,
        # API-based sources added in the kbwhodat fork
        Site.USAJOBS: USAJobs,
        Site.ADZUNA: Adzuna,
        Site.JOOBLE: Jooble,
        Site.FINDWORK: Findwork,
        Site.THE_MUSE: TheMuse,
        Site.INSIGHT_GLOBAL: InsightGlobal,
        Site.CLEARANCE_JOBS: ClearanceJobs,
        Site.KFORCE: Kforce,
        Site.GREENHOUSE: Greenhouse,
        Site.COLLAB_WORK: CollabWork,
        Site.WELLFOUND: Wellfound,
        Site.HIRING_CAFE: HiringCafe,
    }
    set_logger_level(verbose)
    job_type = get_enum_from_value(job_type) if job_type else None

    def get_site_type():
        """Normalize `site_name` (str / Site / list / None) into a list[Site]."""
        site_types = list(Site)
        if isinstance(site_name, str):
            site_types = [map_str_to_site(site_name)]
        elif isinstance(site_name, Site):
            site_types = [site_name]
        elif isinstance(site_name, list):
            site_types = [
                map_str_to_site(site) if isinstance(site, str) else site
                for site in site_name
            ]
        return site_types

    country_enum = Country.from_string(country_indeed)

    scraper_input = ScraperInput(
        site_type=get_site_type(),
        country=country_enum,
        search_term=search_term,
        google_search_term=google_search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        description_format=description_format,
        linkedin_fetch_description=linkedin_fetch_description,
        results_wanted=results_wanted,
        linkedin_company_ids=linkedin_company_ids,
        offset=offset,
        hours_old=hours_old,
    )

    # Pretty names that `.capitalize()` would mangle.
    # BUG FIX: the original computed "ZipRecruiter" and then unconditionally
    # overwrote it with the LinkedIn branch, so zip_recruiter logged as
    # "Zip_recruiter". A single lookup with a capitalize fallback fixes both.
    display_names = {"zip_recruiter": "ZipRecruiter", "linkedin": "LinkedIn"}

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        """Run one site's scraper and return (site value, its JobResponse)."""
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        display = display_names.get(site.value, site.value.capitalize())
        create_logger(display).info("finished scraping")
        return site.value, scraped_data

    site_to_jobs_dict = {}

    # Fan each site out to its own thread; scrapers are I/O-bound.
    with ThreadPoolExecutor() as executor:
        future_to_site = {
            executor.submit(scrape_site, site): site for site in scraper_input.site_type
        }

        for future in as_completed(future_to_site):
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data

    jobs_dfs: list[pd.DataFrame] = []

    for site, job_response in site_to_jobs_dict.items():
        for job in job_response.jobs:
            job_data = job.dict()
            job_data["site"] = site
            job_data["company"] = job_data["company_name"]
            # `jt` (not `job_type`) so the parameter isn't shadowed.
            job_data["job_type"] = (
                ", ".join(jt.value[0] for jt in job_data["job_type"])
                if job_data["job_type"]
                else None
            )
            job_data["emails"] = (
                ", ".join(job_data["emails"]) if job_data["emails"] else None
            )
            if job_data["location"]:
                job_data["location"] = Location(
                    **job_data["location"]
                ).display_location()

            # Flatten compensation; fall back to parsing the description for
            # US searches when the board gave no structured salary.
            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                interval = compensation_obj.get("interval")
                job_data["interval"] = interval.value if interval else None
                job_data["min_amount"] = compensation_obj.get("min_amount")
                job_data["max_amount"] = compensation_obj.get("max_amount")
                job_data["currency"] = compensation_obj.get("currency", "USD")
                job_data["salary_source"] = SalarySource.DIRECT_DATA.value
                if enforce_annual_salary and (
                    job_data["interval"]
                    and job_data["interval"] != "yearly"
                    and job_data["min_amount"]
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)
            else:
                if country_enum == Country.USA:
                    (
                        job_data["interval"],
                        job_data["min_amount"],
                        job_data["max_amount"],
                        job_data["currency"],
                    ) = extract_salary(
                        job_data["description"],
                        enforce_annual_salary=enforce_annual_salary,
                    )
                    job_data["salary_source"] = SalarySource.DESCRIPTION.value

            # A salary source only counts when an actual amount was found.
            job_data["salary_source"] = (
                job_data["salary_source"] if job_data.get("min_amount") else None
            )

            # Naukri-specific fields
            job_data["skills"] = (
                ", ".join(job_data["skills"]) if job_data["skills"] else None
            )
            job_data["experience_range"] = job_data.get("experience_range")
            job_data["company_rating"] = job_data.get("company_rating")
            job_data["company_reviews_count"] = job_data.get("company_reviews_count")
            job_data["vacancy_count"] = job_data.get("vacancy_count")
            job_data["work_from_home_type"] = job_data.get("work_from_home_type")

            jobs_dfs.append(pd.DataFrame([job_data]))

    if jobs_dfs:
        # Step 1: Drop all-NA columns per frame so concat doesn't warn/upcast
        filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]

        # Step 2: Concatenate the filtered DataFrames
        jobs_df = pd.concat(filtered_dfs, ignore_index=True)

        # Step 3: Ensure all desired columns are present, adding missing ones as empty
        for column in desired_order:
            if column not in jobs_df.columns:
                jobs_df[column] = None

        # Reorder the DataFrame according to the desired order
        jobs_df = jobs_df[desired_order]

        # Step 4: Stable presentation order — by site, newest postings first
        return jobs_df.sort_values(
            by=["site", "date_posted"], ascending=[True, False]
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()
251
+
252
+
253
# Public API: `scrape_jobs` is the package's primary entry point and must be
# exported for `from jobdrop import *` to work; BDJobs is re-exported for
# direct use.
__all__ = [
    "scrape_jobs",
    "BDJobs",
]