jobdrop 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jobdrop-2.0.0/LICENSE +21 -0
- jobdrop-2.0.0/PKG-INFO +210 -0
- jobdrop-2.0.0/README.md +177 -0
- jobdrop-2.0.0/jobdrop/__init__.py +256 -0
- jobdrop-2.0.0/jobdrop/_defaults.py +64 -0
- jobdrop-2.0.0/jobdrop/adzuna/__init__.py +238 -0
- jobdrop-2.0.0/jobdrop/bayt/__init__.py +145 -0
- jobdrop-2.0.0/jobdrop/bdjobs/__init__.py +353 -0
- jobdrop-2.0.0/jobdrop/bdjobs/constant.py +32 -0
- jobdrop-2.0.0/jobdrop/bdjobs/util.py +100 -0
- jobdrop-2.0.0/jobdrop/clearancejobs/__init__.py +426 -0
- jobdrop-2.0.0/jobdrop/collabwork/__init__.py +402 -0
- jobdrop-2.0.0/jobdrop/exception.py +45 -0
- jobdrop-2.0.0/jobdrop/findwork/__init__.py +163 -0
- jobdrop-2.0.0/jobdrop/glassdoor/__init__.py +365 -0
- jobdrop-2.0.0/jobdrop/glassdoor/constant.py +184 -0
- jobdrop-2.0.0/jobdrop/glassdoor/util.py +42 -0
- jobdrop-2.0.0/jobdrop/google/__init__.py +491 -0
- jobdrop-2.0.0/jobdrop/google/constant.py +52 -0
- jobdrop-2.0.0/jobdrop/google/util.py +41 -0
- jobdrop-2.0.0/jobdrop/greenhouse/__init__.py +498 -0
- jobdrop-2.0.0/jobdrop/hiring_cafe/__init__.py +396 -0
- jobdrop-2.0.0/jobdrop/hiring_cafe/util.py +3 -0
- jobdrop-2.0.0/jobdrop/indeed/__init__.py +306 -0
- jobdrop-2.0.0/jobdrop/indeed/constant.py +109 -0
- jobdrop-2.0.0/jobdrop/indeed/util.py +83 -0
- jobdrop-2.0.0/jobdrop/insightglobal/__init__.py +287 -0
- jobdrop-2.0.0/jobdrop/jooble/__init__.py +214 -0
- jobdrop-2.0.0/jobdrop/kforce/__init__.py +303 -0
- jobdrop-2.0.0/jobdrop/linkedin/__init__.py +427 -0
- jobdrop-2.0.0/jobdrop/linkedin/constant.py +8 -0
- jobdrop-2.0.0/jobdrop/linkedin/util.py +242 -0
- jobdrop-2.0.0/jobdrop/model.py +348 -0
- jobdrop-2.0.0/jobdrop/naukri/__init__.py +304 -0
- jobdrop-2.0.0/jobdrop/naukri/constant.py +11 -0
- jobdrop-2.0.0/jobdrop/naukri/util.py +38 -0
- jobdrop-2.0.0/jobdrop/the_muse/__init__.py +177 -0
- jobdrop-2.0.0/jobdrop/usajobs/__init__.py +203 -0
- jobdrop-2.0.0/jobdrop/util.py +363 -0
- jobdrop-2.0.0/jobdrop/wellfound/__init__.py +453 -0
- jobdrop-2.0.0/jobdrop/wellfound/util.py +3 -0
- jobdrop-2.0.0/jobdrop/ziprecruiter/__init__.py +285 -0
- jobdrop-2.0.0/jobdrop/ziprecruiter/constant.py +29 -0
- jobdrop-2.0.0/jobdrop/ziprecruiter/util.py +31 -0
- jobdrop-2.0.0/jobdrop_mcp_server/__init__.py +16 -0
- jobdrop-2.0.0/jobdrop_mcp_server/server.py +404 -0
- jobdrop-2.0.0/pyproject.toml +53 -0
jobdrop-2.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 Cullen Watson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
jobdrop-2.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jobdrop
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: 20-source job scraper library + MCP server. LinkedIn, Indeed, Glassdoor, Google, ZipRecruiter, Wellfound, Hiring Cafe, Greenhouse, USAJobs, Adzuna, Jooble, Findwork, The Muse, Insight Global, Clearance Jobs, Kforce, CollabWork, Naukri, Bayt, BDJobs.
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Keywords: jobs-scraper,mcp,linkedin,indeed,glassdoor,ziprecruiter,wellfound,hiring-cafe,greenhouse,usajobs
|
|
7
|
+
Author: kbwhodat
|
|
8
|
+
Author-email: katobyan@gmail.com
|
|
9
|
+
Requires-Python: >=3.10,<4.0
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
16
|
+
Provides-Extra: mcp
|
|
17
|
+
Requires-Dist: beautifulsoup4 (>=4.12.2,<5.0.0)
|
|
18
|
+
Requires-Dist: camoufox (>=0.4.11,<0.5.0)
|
|
19
|
+
Requires-Dist: curl-cffi (>=0.7.0,<0.8.0)
|
|
20
|
+
Requires-Dist: fastmcp (>=0.2.0) ; extra == "mcp"
|
|
21
|
+
Requires-Dist: markdownify (>=1.1.0,<2.0.0)
|
|
22
|
+
Requires-Dist: mcp (>=1.1.0) ; extra == "mcp"
|
|
23
|
+
Requires-Dist: numpy (>=1.26.0)
|
|
24
|
+
Requires-Dist: pandas (>=2.1.0,<3.0.0)
|
|
25
|
+
Requires-Dist: pydantic (>=2.3.0,<3.0.0)
|
|
26
|
+
Requires-Dist: regex (>=2024.4.28,<2025.0.0)
|
|
27
|
+
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
|
28
|
+
Requires-Dist: selenium-driverless (>=1.9.4,<2.0.0)
|
|
29
|
+
Requires-Dist: tls-client (>=1.0.1,<2.0.0)
|
|
30
|
+
Project-URL: Homepage, https://github.com/kbwhodat/jobdrop
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# jobdrop
|
|
34
|
+
|
|
35
|
+
A multi-source job scraper. Hits 20 job boards in one call, normalizes
|
|
36
|
+
the results into a pandas DataFrame, and ships with anti-bot bypasses
|
|
37
|
+
for the boards that block standard scrapers.
|
|
38
|
+
|
|
39
|
+
## What's in here
|
|
40
|
+
|
|
41
|
+
### 20 sources (17 documented below; `collabwork`, `wellfound`, and `hiring_cafe` are also supported)
|
|
42
|
+
|
|
43
|
+
| `site_name` | Source | Mechanism |
|
|
44
|
+
|---|---|---|
|
|
45
|
+
| `linkedin` | LinkedIn | Public listing scrape with optional detail-page enrichment |
|
|
46
|
+
| `indeed` | Indeed | GraphQL with the `Int!` radius fix + per-company cap + paginate-until-quota |
|
|
47
|
+
| `glassdoor` | Glassdoor | selenium-driverless headless to defeat Cloudflare 403; in-page GraphQL fetch |
|
|
48
|
+
| `google` | Google Jobs | selenium-driverless headless against `udm=8`; SERP DOM walk |
|
|
49
|
+
| `zip_recruiter` | ZipRecruiter | `curl_cffi` + `safari17_2_ios` TLS impersonation against the web HTML endpoint |
|
|
50
|
+
| `bayt` | Bayt | Public scrape |
|
|
51
|
+
| `naukri` | Naukri | Public scrape |
|
|
52
|
+
| `bdjobs` | BDJobs | Public scrape |
|
|
53
|
+
| `usajobs` | USAJobs.gov | Federal public API |
|
|
54
|
+
| `adzuna` | Adzuna | Public API |
|
|
55
|
+
| `jooble` | Jooble | Public API |
|
|
56
|
+
| `findwork` | Findwork.dev | Public API |
|
|
57
|
+
| `the_muse` | The Muse | Public API |
|
|
58
|
+
| `insight_global` | Insight Global staffing | Server-rendered HTML scrape with hidden JSON blob per result |
|
|
59
|
+
| `clearance_jobs` | ClearanceJobs (DHI) | Public JSON API + parallel detail-page enrichment for full JD, salary, type, remote bool |
|
|
60
|
+
| `kforce` | Kforce staffing | Direct Azure Cognitive Search calls (bypasses Imperva on the public host) |
|
|
61
|
+
| `greenhouse` | Greenhouse-hosted boards | Google `site:` dorks via selenium-driverless → public Greenhouse API; 3-layer staleness filter |
|
|
62
|
+
|
|
63
|
+
### Anti-bot solved
|
|
64
|
+
|
|
65
|
+
- **Google** — selenium-driverless cold-start headless. Defeats the 2026 CAPTCHA wall that takes out Playwright / undetected-chromedriver / nodriver / patchright.
|
|
66
|
+
- **Glassdoor** — selenium-driverless rewrite to bypass Cloudflare 403; URL-encoded location, partial-GraphQL-error tolerance.
|
|
67
|
+
- **ZipRecruiter** — `curl_cffi` + `safari17_2_ios` against the web HTML endpoint. The iOS-app API is dead behind Cloudflare.
|
|
68
|
+
- **Kforce** — bypasses Imperva on the public host by calling the Azure Cognitive Search backend directly.
|
|
69
|
+
- **Greenhouse** — uses the same selenium-driverless infrastructure as Google for `site:` dorks across all greenhouse-hosted boards.
|
|
70
|
+
|
|
71
|
+
### Other tightening
|
|
72
|
+
|
|
73
|
+
- **LinkedIn** — salary extraction from description body, optional per-company cap, parallel detail fetches.
|
|
74
|
+
- **Indeed** — fixed `radius=25` default after Indeed promoted the GraphQL field to `Int!`; per-company cap to surface diverse employers; pagination loop fixed.
|
|
75
|
+
- **ClearanceJobs** — search API gives a 200-char preview; this fork parallel-fetches `/api/v1/jobs/{id}` so you get the full JD, salary range, structured `job_type`, and authoritative `remote` bool.
|
|
76
|
+
- **Greenhouse** — three layers of stale-protection (404 drop / past `application_deadline` / `first_published` age with 90-day default that respects `hours_old`).
|
|
77
|
+
|
|
78
|
+
### Bundled credentials
|
|
79
|
+
|
|
80
|
+
API keys for USAJobs, Adzuna, Jooble, Findwork, and The Muse are baked
|
|
81
|
+
into a positional resolver (`jobdrop/_defaults.py`) so the new sources
|
|
82
|
+
work without environment setup. User-set env vars still win via
|
|
83
|
+
`setdefault` semantics.
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
pip install -U jobdrop
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Python ≥ 3.10 required.
|
|
92
|
+
|
|
93
|
+
## Usage
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from jobdrop import scrape_jobs
|
|
97
|
+
|
|
98
|
+
jobs = scrape_jobs(
|
|
99
|
+
site_name=["insight_global", "clearance_jobs", "kforce", "greenhouse",
|
|
100
|
+
"linkedin", "indeed", "google"],
|
|
101
|
+
search_term="site reliability engineer",
|
|
102
|
+
location="Atlanta, GA",
|
|
103
|
+
results_wanted=20,
|
|
104
|
+
hours_old=720, # 30-day freshness cap
|
|
105
|
+
country_indeed="usa",
|
|
106
|
+
)
|
|
107
|
+
print(f"Found {len(jobs)} jobs")
|
|
108
|
+
print(jobs[["site", "title", "company", "location", "min_amount", "max_amount", "job_url"]].head())
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Parameters
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
scrape_jobs(
|
|
115
|
+
site_name list[str] | str — any of the supported sources (default: all)
|
|
116
|
+
search_term str — keyword query
|
|
117
|
+
google_search_term str — Google Jobs override (only filter for `google`)
|
|
118
|
+
location str — "City, ST" or ZIP. Each scraper geocodes its own way.
|
|
119
|
+
distance int — radius miles, default 50
|
|
120
|
+
is_remote bool — remote-only filter (where supported)
|
|
121
|
+
job_type str — "fulltime" | "parttime" | "contract" | "internship"
|
|
122
|
+
easy_apply bool — direct-board apply only (LinkedIn easy-apply is broken)
|
|
123
|
+
results_wanted int — per-site target
|
|
124
|
+
offset int — pagination offset
|
|
125
|
+
hours_old int — drop postings older than N hours
|
|
126
|
+
country_indeed str — Indeed/Glassdoor country (see list below)
|
|
127
|
+
description_format str — "markdown" | "html"
|
|
128
|
+
enforce_annual_salary bool — convert hourly/monthly to yearly
|
|
129
|
+
linkedin_fetch_description bool — full JD + direct URL (slower)
|
|
130
|
+
linkedin_company_ids list[int] — filter LinkedIn by company IDs
|
|
131
|
+
proxies list[str] — round-robin proxies, "user:pass@host:port"
|
|
132
|
+
ca_cert str — CA cert path for proxies
|
|
133
|
+
user_agent str — override the default UA
|
|
134
|
+
verbose int — 0 errors / 1 warnings / 2 all
|
|
135
|
+
)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Per-scraper limitations
|
|
139
|
+
|
|
140
|
+
- **Indeed** — only one of `hours_old` / (`job_type`+`is_remote`) / `easy_apply` per call.
|
|
141
|
+
- **LinkedIn** — only one of `hours_old` / `easy_apply` per call.
|
|
142
|
+
- **ClearanceJobs** — location/remote filters require facet IDs from the dropdown endpoints (not implemented). Filter client-side or scope by keyword.
|
|
143
|
+
- **InsightGlobal** — does not expose client-company name (it's the staffing firm). `is_remote` is not available in their data.
|
|
144
|
+
- **Greenhouse** — Google indexes some postings after they're filled. Stale 404s are filtered out; the freshness cutoff filters "live but ancient" postings (default 90 days, override with `hours_old`).
|
|
145
|
+
|
|
146
|
+
## JobPost schema
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
JobPost
|
|
150
|
+
├── id, title, company_name, company_url, job_url
|
|
151
|
+
├── location { country, city, state }
|
|
152
|
+
├── description
|
|
153
|
+
├── is_remote
|
|
154
|
+
├── date_posted
|
|
155
|
+
├── job_type fulltime | parttime | contract | internship
|
|
156
|
+
├── compensation
|
|
157
|
+
│ ├── interval yearly | monthly | weekly | daily | hourly
|
|
158
|
+
│ ├── min_amount, max_amount, currency
|
|
159
|
+
│ └── salary_source
|
|
160
|
+
├── job_level (LinkedIn, ClearanceJobs)
|
|
161
|
+
├── company_industry (LinkedIn, Indeed, Greenhouse, Kforce)
|
|
162
|
+
├── company_country, company_addresses,
|
|
163
|
+
│ company_employees_label, company_revenue_label,
|
|
164
|
+
│ company_description, company_logo (Indeed)
|
|
165
|
+
├── skills, experience_range,
|
|
166
|
+
│ company_rating, company_reviews_count,
|
|
167
|
+
│ vacancy_count, work_from_home_type (Naukri)
|
|
168
|
+
└── emails
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Indeed / Glassdoor country list
|
|
172
|
+
|
|
173
|
+
Pass `country_indeed` (use the exact name; `*` = also supported on Glassdoor):
|
|
174
|
+
|
|
175
|
+
| | | | |
|
|
176
|
+
|---|---|---|---|
|
|
177
|
+
| Argentina | Australia* | Austria* | Bahrain |
|
|
178
|
+
| Belgium* | Brazil* | Canada* | Chile |
|
|
179
|
+
| China | Colombia | Costa Rica | Czech Republic |
|
|
180
|
+
| Denmark | Ecuador | Egypt | Finland |
|
|
181
|
+
| France* | Germany* | Greece | Hong Kong* |
|
|
182
|
+
| Hungary | India* | Indonesia | Ireland* |
|
|
183
|
+
| Israel | Italy* | Japan | Kuwait |
|
|
184
|
+
| Luxembourg | Malaysia | Mexico* | Morocco |
|
|
185
|
+
| Netherlands* | New Zealand* | Nigeria | Norway |
|
|
186
|
+
| Oman | Pakistan | Panama | Peru |
|
|
187
|
+
| Philippines | Poland | Portugal | Qatar |
|
|
188
|
+
| Romania | Saudi Arabia | Singapore* | South Africa |
|
|
189
|
+
| South Korea | Spain* | Sweden | Switzerland* |
|
|
190
|
+
| Taiwan | Thailand | Turkey | Ukraine |
|
|
191
|
+
| United Arab Emirates | UK* | USA* | Uruguay |
|
|
192
|
+
| Venezuela | Vietnam* | | |
|
|
193
|
+
|
|
194
|
+
LinkedIn searches globally and uses only `location`. ZipRecruiter is US/Canada and uses only `location`. Bayt searches internationally with only `search_term`.
|
|
195
|
+
|
|
196
|
+
## Notes
|
|
197
|
+
|
|
198
|
+
- Most boards cap a single search at ~1000 results.
|
|
199
|
+
- LinkedIn rate-limits aggressively around the 10th page of pagination on a single IP. Use `proxies`.
|
|
200
|
+
- For Indeed search-term tuning: it searches the description too. Use `-foo` to exclude, `"exact phrase"` for exact match. Example:
|
|
201
|
+
```python
|
|
202
|
+
search_term='"site reliability engineer" (kubernetes OR terraform) -recruiter'
|
|
203
|
+
```
|
|
204
|
+
- For Google: copy the exact filter syntax from a real Google Jobs search and pass it as `google_search_term`.
|
|
205
|
+
- For Greenhouse: keyword + location are passed straight to a Google `site:greenhouse.io` query, so Boolean operators and quotes work. Don't quote the full `"City, ST"` — quote the city alone, leave the state bare.
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
MIT. See `LICENSE`.
|
|
210
|
+
|
jobdrop-2.0.0/README.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# jobdrop
|
|
2
|
+
|
|
3
|
+
A multi-source job scraper. Hits 20 job boards in one call, normalizes
|
|
4
|
+
the results into a pandas DataFrame, and ships with anti-bot bypasses
|
|
5
|
+
for the boards that block standard scrapers.
|
|
6
|
+
|
|
7
|
+
## What's in here
|
|
8
|
+
|
|
9
|
+
### 20 sources (17 documented below; `collabwork`, `wellfound`, and `hiring_cafe` are also supported)
|
|
10
|
+
|
|
11
|
+
| `site_name` | Source | Mechanism |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| `linkedin` | LinkedIn | Public listing scrape with optional detail-page enrichment |
|
|
14
|
+
| `indeed` | Indeed | GraphQL with the `Int!` radius fix + per-company cap + paginate-until-quota |
|
|
15
|
+
| `glassdoor` | Glassdoor | selenium-driverless headless to defeat Cloudflare 403; in-page GraphQL fetch |
|
|
16
|
+
| `google` | Google Jobs | selenium-driverless headless against `udm=8`; SERP DOM walk |
|
|
17
|
+
| `zip_recruiter` | ZipRecruiter | `curl_cffi` + `safari17_2_ios` TLS impersonation against the web HTML endpoint |
|
|
18
|
+
| `bayt` | Bayt | Public scrape |
|
|
19
|
+
| `naukri` | Naukri | Public scrape |
|
|
20
|
+
| `bdjobs` | BDJobs | Public scrape |
|
|
21
|
+
| `usajobs` | USAJobs.gov | Federal public API |
|
|
22
|
+
| `adzuna` | Adzuna | Public API |
|
|
23
|
+
| `jooble` | Jooble | Public API |
|
|
24
|
+
| `findwork` | Findwork.dev | Public API |
|
|
25
|
+
| `the_muse` | The Muse | Public API |
|
|
26
|
+
| `insight_global` | Insight Global staffing | Server-rendered HTML scrape with hidden JSON blob per result |
|
|
27
|
+
| `clearance_jobs` | ClearanceJobs (DHI) | Public JSON API + parallel detail-page enrichment for full JD, salary, type, remote bool |
|
|
28
|
+
| `kforce` | Kforce staffing | Direct Azure Cognitive Search calls (bypasses Imperva on the public host) |
|
|
29
|
+
| `greenhouse` | Greenhouse-hosted boards | Google `site:` dorks via selenium-driverless → public Greenhouse API; 3-layer staleness filter |
|
|
30
|
+
|
|
31
|
+
### Anti-bot solved
|
|
32
|
+
|
|
33
|
+
- **Google** — selenium-driverless cold-start headless. Defeats the 2026 CAPTCHA wall that takes out Playwright / undetected-chromedriver / nodriver / patchright.
|
|
34
|
+
- **Glassdoor** — selenium-driverless rewrite to bypass Cloudflare 403; URL-encoded location, partial-GraphQL-error tolerance.
|
|
35
|
+
- **ZipRecruiter** — `curl_cffi` + `safari17_2_ios` against the web HTML endpoint. The iOS-app API is dead behind Cloudflare.
|
|
36
|
+
- **Kforce** — bypasses Imperva on the public host by calling the Azure Cognitive Search backend directly.
|
|
37
|
+
- **Greenhouse** — uses the same selenium-driverless infrastructure as Google for `site:` dorks across all greenhouse-hosted boards.
|
|
38
|
+
|
|
39
|
+
### Other tightening
|
|
40
|
+
|
|
41
|
+
- **LinkedIn** — salary extraction from description body, optional per-company cap, parallel detail fetches.
|
|
42
|
+
- **Indeed** — fixed `radius=25` default after Indeed promoted the GraphQL field to `Int!`; per-company cap to surface diverse employers; pagination loop fixed.
|
|
43
|
+
- **ClearanceJobs** — search API gives a 200-char preview; this fork parallel-fetches `/api/v1/jobs/{id}` so you get the full JD, salary range, structured `job_type`, and authoritative `remote` bool.
|
|
44
|
+
- **Greenhouse** — three layers of stale-protection (404 drop / past `application_deadline` / `first_published` age with 90-day default that respects `hours_old`).
|
|
45
|
+
|
|
46
|
+
### Bundled credentials
|
|
47
|
+
|
|
48
|
+
API keys for USAJobs, Adzuna, Jooble, Findwork, and The Muse are baked
|
|
49
|
+
into a positional resolver (`jobdrop/_defaults.py`) so the new sources
|
|
50
|
+
work without environment setup. User-set env vars still win via
|
|
51
|
+
`setdefault` semantics.
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
pip install -U jobdrop
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Python ≥ 3.10 required.
|
|
60
|
+
|
|
61
|
+
## Usage
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from jobdrop import scrape_jobs
|
|
65
|
+
|
|
66
|
+
jobs = scrape_jobs(
|
|
67
|
+
site_name=["insight_global", "clearance_jobs", "kforce", "greenhouse",
|
|
68
|
+
"linkedin", "indeed", "google"],
|
|
69
|
+
search_term="site reliability engineer",
|
|
70
|
+
location="Atlanta, GA",
|
|
71
|
+
results_wanted=20,
|
|
72
|
+
hours_old=720, # 30-day freshness cap
|
|
73
|
+
country_indeed="usa",
|
|
74
|
+
)
|
|
75
|
+
print(f"Found {len(jobs)} jobs")
|
|
76
|
+
print(jobs[["site", "title", "company", "location", "min_amount", "max_amount", "job_url"]].head())
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Parameters
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
scrape_jobs(
|
|
83
|
+
site_name list[str] | str — any of the supported sources (default: all)
|
|
84
|
+
search_term str — keyword query
|
|
85
|
+
google_search_term str — Google Jobs override (only filter for `google`)
|
|
86
|
+
location str — "City, ST" or ZIP. Each scraper geocodes its own way.
|
|
87
|
+
distance int — radius miles, default 50
|
|
88
|
+
is_remote bool — remote-only filter (where supported)
|
|
89
|
+
job_type str — "fulltime" | "parttime" | "contract" | "internship"
|
|
90
|
+
easy_apply bool — direct-board apply only (LinkedIn easy-apply is broken)
|
|
91
|
+
results_wanted int — per-site target
|
|
92
|
+
offset int — pagination offset
|
|
93
|
+
hours_old int — drop postings older than N hours
|
|
94
|
+
country_indeed str — Indeed/Glassdoor country (see list below)
|
|
95
|
+
description_format str — "markdown" | "html"
|
|
96
|
+
enforce_annual_salary bool — convert hourly/monthly to yearly
|
|
97
|
+
linkedin_fetch_description bool — full JD + direct URL (slower)
|
|
98
|
+
linkedin_company_ids list[int] — filter LinkedIn by company IDs
|
|
99
|
+
proxies list[str] — round-robin proxies, "user:pass@host:port"
|
|
100
|
+
ca_cert str — CA cert path for proxies
|
|
101
|
+
user_agent str — override the default UA
|
|
102
|
+
verbose int — 0 errors / 1 warnings / 2 all
|
|
103
|
+
)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Per-scraper limitations
|
|
107
|
+
|
|
108
|
+
- **Indeed** — only one of `hours_old` / (`job_type`+`is_remote`) / `easy_apply` per call.
|
|
109
|
+
- **LinkedIn** — only one of `hours_old` / `easy_apply` per call.
|
|
110
|
+
- **ClearanceJobs** — location/remote filters require facet IDs from the dropdown endpoints (not implemented). Filter client-side or scope by keyword.
|
|
111
|
+
- **InsightGlobal** — does not expose client-company name (it's the staffing firm). `is_remote` is not available in their data.
|
|
112
|
+
- **Greenhouse** — Google indexes some postings after they're filled. Stale 404s are filtered out; the freshness cutoff filters "live but ancient" postings (default 90 days, override with `hours_old`).
|
|
113
|
+
|
|
114
|
+
## JobPost schema
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
JobPost
|
|
118
|
+
├── id, title, company_name, company_url, job_url
|
|
119
|
+
├── location { country, city, state }
|
|
120
|
+
├── description
|
|
121
|
+
├── is_remote
|
|
122
|
+
├── date_posted
|
|
123
|
+
├── job_type fulltime | parttime | contract | internship
|
|
124
|
+
├── compensation
|
|
125
|
+
│ ├── interval yearly | monthly | weekly | daily | hourly
|
|
126
|
+
│ ├── min_amount, max_amount, currency
|
|
127
|
+
│ └── salary_source
|
|
128
|
+
├── job_level (LinkedIn, ClearanceJobs)
|
|
129
|
+
├── company_industry (LinkedIn, Indeed, Greenhouse, Kforce)
|
|
130
|
+
├── company_country, company_addresses,
|
|
131
|
+
│ company_employees_label, company_revenue_label,
|
|
132
|
+
│ company_description, company_logo (Indeed)
|
|
133
|
+
├── skills, experience_range,
|
|
134
|
+
│ company_rating, company_reviews_count,
|
|
135
|
+
│ vacancy_count, work_from_home_type (Naukri)
|
|
136
|
+
└── emails
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Indeed / Glassdoor country list
|
|
140
|
+
|
|
141
|
+
Pass `country_indeed` (use the exact name; `*` = also supported on Glassdoor):
|
|
142
|
+
|
|
143
|
+
| | | | |
|
|
144
|
+
|---|---|---|---|
|
|
145
|
+
| Argentina | Australia* | Austria* | Bahrain |
|
|
146
|
+
| Belgium* | Brazil* | Canada* | Chile |
|
|
147
|
+
| China | Colombia | Costa Rica | Czech Republic |
|
|
148
|
+
| Denmark | Ecuador | Egypt | Finland |
|
|
149
|
+
| France* | Germany* | Greece | Hong Kong* |
|
|
150
|
+
| Hungary | India* | Indonesia | Ireland* |
|
|
151
|
+
| Israel | Italy* | Japan | Kuwait |
|
|
152
|
+
| Luxembourg | Malaysia | Mexico* | Morocco |
|
|
153
|
+
| Netherlands* | New Zealand* | Nigeria | Norway |
|
|
154
|
+
| Oman | Pakistan | Panama | Peru |
|
|
155
|
+
| Philippines | Poland | Portugal | Qatar |
|
|
156
|
+
| Romania | Saudi Arabia | Singapore* | South Africa |
|
|
157
|
+
| South Korea | Spain* | Sweden | Switzerland* |
|
|
158
|
+
| Taiwan | Thailand | Turkey | Ukraine |
|
|
159
|
+
| United Arab Emirates | UK* | USA* | Uruguay |
|
|
160
|
+
| Venezuela | Vietnam* | | |
|
|
161
|
+
|
|
162
|
+
LinkedIn searches globally and uses only `location`. ZipRecruiter is US/Canada and uses only `location`. Bayt searches internationally with only `search_term`.
|
|
163
|
+
|
|
164
|
+
## Notes
|
|
165
|
+
|
|
166
|
+
- Most boards cap a single search at ~1000 results.
|
|
167
|
+
- LinkedIn rate-limits aggressively around the 10th page of pagination on a single IP. Use `proxies`.
|
|
168
|
+
- For Indeed search-term tuning: it searches the description too. Use `-foo` to exclude, `"exact phrase"` for exact match. Example:
|
|
169
|
+
```python
|
|
170
|
+
search_term='"site reliability engineer" (kubernetes OR terraform) -recruiter'
|
|
171
|
+
```
|
|
172
|
+
- For Google: copy the exact filter syntax from a real Google Jobs search and pass it as `google_search_term`.
|
|
173
|
+
- For Greenhouse: keyword + location are passed straight to a Google `site:greenhouse.io` query, so Boolean operators and quotes work. Don't quote the full `"City, ST"` — quote the city alone, leave the state bare.
|
|
174
|
+
|
|
175
|
+
## License
|
|
176
|
+
|
|
177
|
+
MIT. See `LICENSE`.
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# Apply compiled defaults BEFORE any scraper module runs its module-level
|
|
4
|
+
# os.environ reads. User-set env vars are preserved (setdefault semantics).
|
|
5
|
+
from jobdrop import _defaults # noqa: F401
|
|
6
|
+
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from jobdrop.adzuna import Adzuna
|
|
13
|
+
from jobdrop.bayt import BaytScraper
|
|
14
|
+
from jobdrop.bdjobs import BDJobs
|
|
15
|
+
from jobdrop.clearancejobs import ClearanceJobs
|
|
16
|
+
from jobdrop.collabwork import CollabWork
|
|
17
|
+
from jobdrop.findwork import Findwork
|
|
18
|
+
from jobdrop.glassdoor import Glassdoor
|
|
19
|
+
from jobdrop.google import Google
|
|
20
|
+
from jobdrop.greenhouse import Greenhouse
|
|
21
|
+
from jobdrop.hiring_cafe import HiringCafe
|
|
22
|
+
from jobdrop.indeed import Indeed
|
|
23
|
+
from jobdrop.insightglobal import InsightGlobal
|
|
24
|
+
from jobdrop.jooble import Jooble
|
|
25
|
+
from jobdrop.kforce import Kforce
|
|
26
|
+
from jobdrop.linkedin import LinkedIn
|
|
27
|
+
from jobdrop.naukri import Naukri
|
|
28
|
+
from jobdrop.the_muse import TheMuse
|
|
29
|
+
from jobdrop.usajobs import USAJobs
|
|
30
|
+
from jobdrop.wellfound import Wellfound
|
|
31
|
+
from jobdrop.model import JobType, Location, JobResponse, Country
|
|
32
|
+
from jobdrop.model import SalarySource, ScraperInput, Site
|
|
33
|
+
from jobdrop.util import (
|
|
34
|
+
set_logger_level,
|
|
35
|
+
extract_salary,
|
|
36
|
+
create_logger,
|
|
37
|
+
get_enum_from_value,
|
|
38
|
+
map_str_to_site,
|
|
39
|
+
convert_to_annual,
|
|
40
|
+
desired_order,
|
|
41
|
+
)
|
|
42
|
+
from jobdrop.ziprecruiter import ZipRecruiter
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# NOTE: SCRAPER_MAPPING (defined inside scrape_jobs below) maps each Site enum member to its scraper class.
|
|
46
|
+
|
|
47
|
+
def scrape_jobs(
|
|
48
|
+
site_name: str | list[str] | Site | list[Site] | None = None,
|
|
49
|
+
search_term: str | None = None,
|
|
50
|
+
google_search_term: str | None = None,
|
|
51
|
+
location: str | None = None,
|
|
52
|
+
distance: int | None = 50,
|
|
53
|
+
is_remote: bool = False,
|
|
54
|
+
job_type: str | None = None,
|
|
55
|
+
easy_apply: bool | None = None,
|
|
56
|
+
results_wanted: int = 15,
|
|
57
|
+
country_indeed: str = "usa",
|
|
58
|
+
proxies: list[str] | str | None = None,
|
|
59
|
+
ca_cert: str | None = None,
|
|
60
|
+
description_format: str = "markdown",
|
|
61
|
+
linkedin_fetch_description: bool | None = False,
|
|
62
|
+
linkedin_company_ids: list[int] | None = None,
|
|
63
|
+
offset: int | None = 0,
|
|
64
|
+
hours_old: int = None,
|
|
65
|
+
enforce_annual_salary: bool = False,
|
|
66
|
+
verbose: int = 0,
|
|
67
|
+
user_agent: str = None,
|
|
68
|
+
**kwargs,
|
|
69
|
+
) -> pd.DataFrame:
|
|
70
|
+
"""
|
|
71
|
+
Scrapes job data from job boards concurrently
|
|
72
|
+
:return: Pandas DataFrame containing job data
|
|
73
|
+
"""
|
|
74
|
+
SCRAPER_MAPPING = {
|
|
75
|
+
Site.LINKEDIN: LinkedIn,
|
|
76
|
+
Site.INDEED: Indeed,
|
|
77
|
+
Site.ZIP_RECRUITER: ZipRecruiter,
|
|
78
|
+
Site.GLASSDOOR: Glassdoor,
|
|
79
|
+
Site.GOOGLE: Google,
|
|
80
|
+
Site.BAYT: BaytScraper,
|
|
81
|
+
Site.NAUKRI: Naukri,
|
|
82
|
+
Site.BDJOBS: BDJobs,
|
|
83
|
+
# API-based sources added in the kbwhodat fork
|
|
84
|
+
Site.USAJOBS: USAJobs,
|
|
85
|
+
Site.ADZUNA: Adzuna,
|
|
86
|
+
Site.JOOBLE: Jooble,
|
|
87
|
+
Site.FINDWORK: Findwork,
|
|
88
|
+
Site.THE_MUSE: TheMuse,
|
|
89
|
+
Site.INSIGHT_GLOBAL: InsightGlobal,
|
|
90
|
+
Site.CLEARANCE_JOBS: ClearanceJobs,
|
|
91
|
+
Site.KFORCE: Kforce,
|
|
92
|
+
Site.GREENHOUSE: Greenhouse,
|
|
93
|
+
Site.COLLAB_WORK: CollabWork,
|
|
94
|
+
Site.WELLFOUND: Wellfound,
|
|
95
|
+
Site.HIRING_CAFE: HiringCafe,
|
|
96
|
+
}
|
|
97
|
+
set_logger_level(verbose)
|
|
98
|
+
job_type = get_enum_from_value(job_type) if job_type else None
|
|
99
|
+
|
|
100
|
+
def get_site_type():
    """Normalize the closed-over ``site_name`` argument into a list of Site enums.

    Accepts a single string, a single Site, or a list mixing both; any
    other value (including None) selects every known Site.
    """
    if isinstance(site_name, str):
        return [map_str_to_site(site_name)]
    if isinstance(site_name, Site):
        return [site_name]
    if isinstance(site_name, list):
        return [
            map_str_to_site(entry) if isinstance(entry, str) else entry
            for entry in site_name
        ]
    # None (or anything unrecognized) means "scrape all sources".
    return list(Site)
|
|
112
|
+
|
|
113
|
+
country_enum = Country.from_string(country_indeed)
|
|
114
|
+
|
|
115
|
+
scraper_input = ScraperInput(
|
|
116
|
+
site_type=get_site_type(),
|
|
117
|
+
country=country_enum,
|
|
118
|
+
search_term=search_term,
|
|
119
|
+
google_search_term=google_search_term,
|
|
120
|
+
location=location,
|
|
121
|
+
distance=distance,
|
|
122
|
+
is_remote=is_remote,
|
|
123
|
+
job_type=job_type,
|
|
124
|
+
easy_apply=easy_apply,
|
|
125
|
+
description_format=description_format,
|
|
126
|
+
linkedin_fetch_description=linkedin_fetch_description,
|
|
127
|
+
results_wanted=results_wanted,
|
|
128
|
+
linkedin_company_ids=linkedin_company_ids,
|
|
129
|
+
offset=offset,
|
|
130
|
+
hours_old=hours_old,
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
    """Run one scraper and return ``(site_value, JobResponse)``.

    Instantiates the scraper class mapped to *site*, executes it against
    the shared ``scraper_input``, and logs completion under a
    human-friendly display name.
    """
    scraper_class = SCRAPER_MAPPING[site]
    scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
    scraped_data: JobResponse = scraper.scrape(scraper_input)
    cap_name = site.value.capitalize()
    # BUG FIX: the original assigned the ZipRecruiter override and then
    # unconditionally reassigned from the LinkedIn check, clobbering it.
    # Use a single chained mapping so both brand names survive. The local
    # is renamed so it no longer shadows the enclosing `site_name` param.
    if cap_name == "Zip_recruiter":
        display_name = "ZipRecruiter"
    elif cap_name == "Linkedin":
        display_name = "LinkedIn"
    else:
        display_name = cap_name
    create_logger(display_name).info("finished scraping")
    return site.value, scraped_data
|
|
142
|
+
|
|
143
|
+
site_to_jobs_dict = {}
|
|
144
|
+
|
|
145
|
+
def worker(site):
    # Thin adapter so the executor maps a Site straight to its
    # (site_value, JobResponse) pair from scrape_site.
    return scrape_site(site)
|
|
148
|
+
|
|
149
|
+
with ThreadPoolExecutor() as executor:
|
|
150
|
+
future_to_site = {
|
|
151
|
+
executor.submit(worker, site): site for site in scraper_input.site_type
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
for future in as_completed(future_to_site):
|
|
155
|
+
site_value, scraped_data = future.result()
|
|
156
|
+
site_to_jobs_dict[site_value] = scraped_data
|
|
157
|
+
|
|
158
|
+
jobs_dfs: list[pd.DataFrame] = []
|
|
159
|
+
|
|
160
|
+
for site, job_response in site_to_jobs_dict.items():
|
|
161
|
+
for job in job_response.jobs:
|
|
162
|
+
job_data = job.dict()
|
|
163
|
+
job_url = job_data["job_url"]
|
|
164
|
+
job_data["site"] = site
|
|
165
|
+
job_data["company"] = job_data["company_name"]
|
|
166
|
+
job_data["job_type"] = (
|
|
167
|
+
", ".join(job_type.value[0] for job_type in job_data["job_type"])
|
|
168
|
+
if job_data["job_type"]
|
|
169
|
+
else None
|
|
170
|
+
)
|
|
171
|
+
job_data["emails"] = (
|
|
172
|
+
", ".join(job_data["emails"]) if job_data["emails"] else None
|
|
173
|
+
)
|
|
174
|
+
if job_data["location"]:
|
|
175
|
+
job_data["location"] = Location(
|
|
176
|
+
**job_data["location"]
|
|
177
|
+
).display_location()
|
|
178
|
+
|
|
179
|
+
# Handle compensation
|
|
180
|
+
compensation_obj = job_data.get("compensation")
|
|
181
|
+
if compensation_obj and isinstance(compensation_obj, dict):
|
|
182
|
+
job_data["interval"] = (
|
|
183
|
+
compensation_obj.get("interval").value
|
|
184
|
+
if compensation_obj.get("interval")
|
|
185
|
+
else None
|
|
186
|
+
)
|
|
187
|
+
job_data["min_amount"] = compensation_obj.get("min_amount")
|
|
188
|
+
job_data["max_amount"] = compensation_obj.get("max_amount")
|
|
189
|
+
job_data["currency"] = compensation_obj.get("currency", "USD")
|
|
190
|
+
job_data["salary_source"] = SalarySource.DIRECT_DATA.value
|
|
191
|
+
if enforce_annual_salary and (
|
|
192
|
+
job_data["interval"]
|
|
193
|
+
and job_data["interval"] != "yearly"
|
|
194
|
+
and job_data["min_amount"]
|
|
195
|
+
and job_data["max_amount"]
|
|
196
|
+
):
|
|
197
|
+
convert_to_annual(job_data)
|
|
198
|
+
else:
|
|
199
|
+
if country_enum == Country.USA:
|
|
200
|
+
(
|
|
201
|
+
job_data["interval"],
|
|
202
|
+
job_data["min_amount"],
|
|
203
|
+
job_data["max_amount"],
|
|
204
|
+
job_data["currency"],
|
|
205
|
+
) = extract_salary(
|
|
206
|
+
job_data["description"],
|
|
207
|
+
enforce_annual_salary=enforce_annual_salary,
|
|
208
|
+
)
|
|
209
|
+
job_data["salary_source"] = SalarySource.DESCRIPTION.value
|
|
210
|
+
|
|
211
|
+
job_data["salary_source"] = (
|
|
212
|
+
job_data["salary_source"]
|
|
213
|
+
if "min_amount" in job_data and job_data["min_amount"]
|
|
214
|
+
else None
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
#naukri-specific fields
|
|
218
|
+
job_data["skills"] = (
|
|
219
|
+
", ".join(job_data["skills"]) if job_data["skills"] else None
|
|
220
|
+
)
|
|
221
|
+
job_data["experience_range"] = job_data.get("experience_range")
|
|
222
|
+
job_data["company_rating"] = job_data.get("company_rating")
|
|
223
|
+
job_data["company_reviews_count"] = job_data.get("company_reviews_count")
|
|
224
|
+
job_data["vacancy_count"] = job_data.get("vacancy_count")
|
|
225
|
+
job_data["work_from_home_type"] = job_data.get("work_from_home_type")
|
|
226
|
+
|
|
227
|
+
job_df = pd.DataFrame([job_data])
|
|
228
|
+
jobs_dfs.append(job_df)
|
|
229
|
+
|
|
230
|
+
if jobs_dfs:
|
|
231
|
+
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
|
|
232
|
+
filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
|
|
233
|
+
|
|
234
|
+
# Step 2: Concatenate the filtered DataFrames
|
|
235
|
+
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
|
|
236
|
+
|
|
237
|
+
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
|
238
|
+
for column in desired_order:
|
|
239
|
+
if column not in jobs_df.columns:
|
|
240
|
+
jobs_df[column] = None # Add missing columns as empty
|
|
241
|
+
|
|
242
|
+
# Reorder the DataFrame according to the desired order
|
|
243
|
+
jobs_df = jobs_df[desired_order]
|
|
244
|
+
|
|
245
|
+
# Step 4: Sort the DataFrame as required
|
|
246
|
+
return jobs_df.sort_values(
|
|
247
|
+
by=["site", "date_posted"], ascending=[True, False]
|
|
248
|
+
).reset_index(drop=True)
|
|
249
|
+
else:
|
|
250
|
+
return pd.DataFrame()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# Add BDJobs to __all__
# NOTE(review): this binds __all__ to ONLY "BDJobs", so `from jobdrop import *`
# would export nothing else. If other public names (e.g. the main scraping
# entry point) should be star-importable, they must be listed here too —
# confirm against the rest of this module.
__all__ = [
    "BDJobs",
]
|