hirehunt 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hirehunt-0.2.0/PKG-INFO +323 -0
- hirehunt-0.2.0/README.md +305 -0
- hirehunt-0.2.0/hirehunt.egg-info/PKG-INFO +323 -0
- hirehunt-0.2.0/hirehunt.egg-info/SOURCES.txt +42 -0
- hirehunt-0.2.0/hirehunt.egg-info/dependency_links.txt +1 -0
- hirehunt-0.2.0/hirehunt.egg-info/entry_points.txt +2 -0
- hirehunt-0.2.0/hirehunt.egg-info/requires.txt +12 -0
- hirehunt-0.2.0/hirehunt.egg-info/top_level.txt +1 -0
- hirehunt-0.2.0/jobhunter/__init__.py +20 -0
- hirehunt-0.2.0/jobhunter/cli.py +155 -0
- hirehunt-0.2.0/jobhunter/engine.py +112 -0
- hirehunt-0.2.0/jobhunter/exceptions.py +21 -0
- hirehunt-0.2.0/jobhunter/exporters/__init__.py +1 -0
- hirehunt-0.2.0/jobhunter/exporters/csv.py +19 -0
- hirehunt-0.2.0/jobhunter/exporters/dataframe.py +11 -0
- hirehunt-0.2.0/jobhunter/exporters/json.py +12 -0
- hirehunt-0.2.0/jobhunter/filtering.py +117 -0
- hirehunt-0.2.0/jobhunter/models.py +131 -0
- hirehunt-0.2.0/jobhunter/query.py +84 -0
- hirehunt-0.2.0/jobhunter/ranking.py +132 -0
- hirehunt-0.2.0/jobhunter/registry.py +46 -0
- hirehunt-0.2.0/jobhunter/scrapers/__init__.py +34 -0
- hirehunt-0.2.0/jobhunter/scrapers/base.py +45 -0
- hirehunt-0.2.0/jobhunter/scrapers/faang.py +266 -0
- hirehunt-0.2.0/jobhunter/scrapers/indeed.py +339 -0
- hirehunt-0.2.0/jobhunter/scrapers/internshala.py +198 -0
- hirehunt-0.2.0/jobhunter/scrapers/linkedin.py +72 -0
- hirehunt-0.2.0/jobhunter/scrapers/naukri.py +265 -0
- hirehunt-0.2.0/jobhunter/scrapers/shine.py +247 -0
- hirehunt-0.2.0/jobhunter/scrapers/unstop.py +204 -0
- hirehunt-0.2.0/jobhunter/utils/__init__.py +1 -0
- hirehunt-0.2.0/jobhunter/utils/cache.py +50 -0
- hirehunt-0.2.0/jobhunter/utils/dedupe.py +38 -0
- hirehunt-0.2.0/jobhunter/utils/fetchers.py +110 -0
- hirehunt-0.2.0/jobhunter/utils/http.py +65 -0
- hirehunt-0.2.0/jobhunter/utils/normalization.py +187 -0
- hirehunt-0.2.0/jobhunter/validation.py +112 -0
- hirehunt-0.2.0/pyproject.toml +29 -0
- hirehunt-0.2.0/setup.cfg +4 -0
- hirehunt-0.2.0/tests/test_dedupe_filter_rank.py +41 -0
- hirehunt-0.2.0/tests/test_filter_matrix.py +103 -0
- hirehunt-0.2.0/tests/test_normalization.py +29 -0
- hirehunt-0.2.0/tests/test_parsers.py +100 -0
- hirehunt-0.2.0/tests/test_v02_features.py +81 -0
hirehunt-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hirehunt
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A programmable job-scraping framework for India & global markets. Aggregates Naukri, Shine, Internshala, LinkedIn, Indeed, and FAANG companies into a unified dataset.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
8
|
+
Requires-Dist: lxml>=4.9.0
|
|
9
|
+
Requires-Dist: pandas>=2.0.0
|
|
10
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
11
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
12
|
+
Requires-Dist: requests>=2.31.0
|
|
13
|
+
Requires-Dist: rich>=13.0.0
|
|
14
|
+
Requires-Dist: urllib3>=2.0.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: build>=1.2.0; extra == "dev"
|
|
17
|
+
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
18
|
+
|
|
19
|
+
# 🎯 HireHunt
|
|
20
|
+
|
|
21
|
+
**A programmable job-scraping framework for India & global markets.**
|
|
22
|
+
Aggregate jobs from **12 sources** — Naukri, Internshala, Shine, Unstop, LinkedIn, Indeed, and FAANG companies — into a unified, filterable, ranked dataset.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## ✨ Sources
|
|
27
|
+
|
|
28
|
+
| Source | Region | Type | Method |
|
|
29
|
+
|---|---|---|---|
|
|
30
|
+
| `naukri` | 🇮🇳 India | Jobs | REST API — 15,000+ listings |
|
|
31
|
+
| `shine` | 🇮🇳 India | Jobs | SSR JSON — 17,000+ listings |
|
|
32
|
+
| `internshala` | 🇮🇳 India | Internships / Jobs | HTML scraping |
|
|
33
|
+
| `unstop` | 🇮🇳 India | Hackathons / Competitions | REST API |
|
|
34
|
+
| `linkedin` | ๐ Global | Jobs | Guest HTML API |
|
|
35
|
+
| `indeed` | ๐ Global | Jobs | GraphQL API |
|
|
36
|
+
| `google_careers` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
37
|
+
| `amazon` | ๐ FAANG | Jobs | REST API |
|
|
38
|
+
| `meta` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
39
|
+
| `apple` | ๐ FAANG | Jobs | LinkedIn (keyword search) |
|
|
40
|
+
| `netflix` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
41
|
+
| `microsoft` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## 📦 Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install hirehunt
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
> **Note:** The PyPI package is `hirehunt`. The import name is `jobhunter`.
|
|
52
|
+
> ```python
|
|
53
|
+
> import jobhunter # ← this is correct after pip install hirehunt
|
|
54
|
+
> ```
|
|
55
|
+
|
|
56
|
+
**Requirements:** Python 3.10+
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## ⚡ Quick Start
|
|
61
|
+
|
|
62
|
+
### Python API
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from jobhunter import scrape_jobs
|
|
66
|
+
|
|
67
|
+
# Search across India's top job boards
|
|
68
|
+
jobs = scrape_jobs(
|
|
69
|
+
search_term="python developer",
|
|
70
|
+
sources=["naukri", "shine", "internshala"],
|
|
71
|
+
city="Bengaluru",
|
|
72
|
+
results_wanted=50,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
for job in jobs:
|
|
76
|
+
print(job)
|
|
77
|
+
# Python Developer @ TCS | Bengaluru | naukri
|
|
78
|
+
# Python Developer @ Infosys | Bengaluru | shine
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### CLI
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
# India job search
|
|
85
|
+
jobhunter search "data scientist" --city Mumbai --sources naukri,shine
|
|
86
|
+
|
|
87
|
+
# Hackathons & competitions
|
|
88
|
+
jobhunter search "hackathon" --sources unstop
|
|
89
|
+
|
|
90
|
+
# FAANG company jobs
|
|
91
|
+
jobhunter search "software engineer" --sources google_careers,amazon,netflix
|
|
92
|
+
|
|
93
|
+
# Export to CSV
|
|
94
|
+
jobhunter search "backend developer" --sources naukri,linkedin --output jobs.csv
|
|
95
|
+
|
|
96
|
+
# Top 20 ranked results
|
|
97
|
+
jobhunter search "machine learning" --sources naukri,shine,linkedin --top 20
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 🔧 Python API Reference
|
|
103
|
+
|
|
104
|
+
### `scrape_jobs()`
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from jobhunter import scrape_jobs
|
|
108
|
+
|
|
109
|
+
jobs = scrape_jobs(
|
|
110
|
+
search_term="python developer", # What to search
|
|
111
|
+
sources=["naukri", "shine"], # Which sources (list or "auto")
|
|
112
|
+
city="Bengaluru", # City filter (optional)
|
|
113
|
+
location="India", # Broader location (optional)
|
|
114
|
+
country="India", # Country (optional)
|
|
115
|
+
results_wanted=50, # Max results per source
|
|
116
|
+
job_kind="job", # "job", "internship", "hackathon"
|
|
117
|
+
remote=None, # True = remote only
|
|
118
|
+
salary_min=500000, # Min salary in INR (optional)
|
|
119
|
+
posted_within_days=30, # Only jobs from last N days
|
|
120
|
+
skills=["python", "django"], # Skill filter (optional)
|
|
121
|
+
experience_min=0, # Min years experience (optional)
|
|
122
|
+
experience_max=5, # Max years experience (optional)
|
|
123
|
+
)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### `Job` Object
|
|
127
|
+
|
|
128
|
+
Every source returns the same normalized `Job` dataclass:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
@dataclass
|
|
132
|
+
class Job:
|
|
133
|
+
title: str
|
|
134
|
+
company: str
|
|
135
|
+
source: str
|
|
136
|
+
job_url: str
|
|
137
|
+
|
|
138
|
+
location: str
|
|
139
|
+
city: str
|
|
140
|
+
country: str
|
|
141
|
+
work_mode: WorkMode # "remote" | "hybrid" | "onsite" | "unknown"
|
|
142
|
+
job_kind: JobKind # "job" | "internship" | "hackathon" | "competition"
|
|
143
|
+
|
|
144
|
+
salary: Money # min_amount, max_amount, currency, period
|
|
145
|
+
stipend: Money
|
|
146
|
+
|
|
147
|
+
skills: list[str]
|
|
148
|
+
experience_min: float | None
|
|
149
|
+
experience_max: float | None
|
|
150
|
+
description: str
|
|
151
|
+
date_posted: str | None
|
|
152
|
+
deadline: str | None # for competitions/hackathons
|
|
153
|
+
|
|
154
|
+
match_score: float # 0.0–1.0 after ranking
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Export
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from jobhunter import scrape_jobs
|
|
161
|
+
from jobhunter.exporters import to_csv, to_json, to_dataframe
|
|
162
|
+
|
|
163
|
+
jobs = scrape_jobs("python developer", sources=["naukri", "shine"])
|
|
164
|
+
|
|
165
|
+
to_csv(jobs, "jobs.csv")
|
|
166
|
+
to_json(jobs, "jobs.json")
|
|
167
|
+
df = to_dataframe(jobs) # pandas DataFrame
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## 🏗️ Project Structure
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
jobhunter/
|
|
176
|
+
โโโ __init__.py # scrape_jobs() entry point
|
|
177
|
+
โโโ models.py # Job, Money, WorkMode, JobKind dataclasses
|
|
178
|
+
โโโ query.py # JobQuery โ unified search parameters
|
|
179
|
+
โโโ engine.py # Orchestrates parallel scraping + dedup
|
|
180
|
+
โโโ registry.py # Scraper registry + auto-source selection
|
|
181
|
+
โโโ filtering.py # Soft filtering (salary, city, skills, date)
|
|
182
|
+
โโโ ranking.py # Relevance scoring / match_score
|
|
183
|
+
โโโ validation.py # Input validation
|
|
184
|
+
โโโ exceptions.py # Custom exceptions
|
|
185
|
+
โโโ cli.py # `jobhunter` CLI entry point
|
|
186
|
+
โ
|
|
187
|
+
โโโ scrapers/
|
|
188
|
+
โ โโโ base.py # BaseScraper ABC
|
|
189
|
+
โ โโโ naukri.py # ๐ฎ๐ณ Naukri โ /jobapi/v2/search REST API
|
|
190
|
+
โ โโโ shine.py # ๐ฎ๐ณ Shine โ __NEXT_DATA__ SSR JSON
|
|
191
|
+
โ โโโ internshala.py # ๐ฎ๐ณ Internshala โ HTML + pagination
|
|
192
|
+
โ โโโ unstop.py # ๐ฎ๐ณ Unstop โ hackathons REST API
|
|
193
|
+
โ โโโ linkedin.py # ๐ LinkedIn โ guest HTML API
|
|
194
|
+
โ โโโ indeed.py # ๐ Indeed โ GraphQL API
|
|
195
|
+
โ โโโ faang.py # ๐ Google, Amazon, Meta, Apple, Netflix, Microsoft
|
|
196
|
+
โ
|
|
197
|
+
โโโ exporters/
|
|
198
|
+
│   ├── csv.py
|
|
199
|
+
│   ├── json.py
|
|
200
|
+
│   └── dataframe.py
|
|
201
|
+
โ
|
|
202
|
+
โโโ utils/
|
|
203
|
+
โโโ fetchers.py # CachedFetcher with proxy + backend support
|
|
204
|
+
โโโ normalization.py # clean_text, parse_money, normalize_city, ...
|
|
205
|
+
|
|
206
|
+
tests/
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Source Details
|
|
212
|
+
|
|
213
|
+
### 🇮🇳 Naukri
|
|
214
|
+
- **Endpoint:** `GET https://www.naukri.com/jobapi/v2/search`
|
|
215
|
+
- **Auth:** Session cookies from page warm-up (automatic)
|
|
216
|
+
- **Fields:** Title, company, salary (LPA), location, skills, experience, date
|
|
217
|
+
- **Pagination:** `pageNo=N`, 20 results/page, 3,000+ pages available
|
|
218
|
+
|
|
219
|
+
### 🇮🇳 Shine
|
|
220
|
+
- **Endpoint:** `__NEXT_DATA__` SSR JSON embedded in HTML
|
|
221
|
+
- **Fields:** `jJT` (title), `jCName` (company), `jSal` (salary), `jLoc` (location), `jKwd` (skills), `jPDate` (date), `jSlug` (URL)
|
|
222
|
+
- **Pagination:** `?page=N`, 20 results/page, 900+ pages
|
|
223
|
+
|
|
224
|
+
### 🇮🇳 Internshala
|
|
225
|
+
- **Endpoint:** HTML scraping — `div[id^='individual_internship_'][internshipid]`
|
|
226
|
+
- **Pagination:** `?page=N`, 40+ cards/page
|
|
227
|
+
- **City filter:** URL slug e.g. `/internships/python-intern-in-bengaluru/`
|
|
228
|
+
|
|
229
|
+
### 🇮🇳 Unstop
|
|
230
|
+
- **Endpoint:** `GET https://unstop.com/api/public/opportunity/search-result`
|
|
231
|
+
- **Note:** Returns hackathons, coding competitions, and challenges only
|
|
232
|
+
- **Fields:** Title, organisation, skills, location, deadline, prize
|
|
233
|
+
|
|
234
|
+
### ๐ Indeed
|
|
235
|
+
- **Endpoint:** `POST https://apis.indeed.com/graphql`
|
|
236
|
+
- **Auth:** Public API key (included)
|
|
237
|
+
- **Pagination:** Cursor-based
|
|
238
|
+
|
|
239
|
+
### ๐ LinkedIn
|
|
240
|
+
- **Endpoint:** `GET https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search`
|
|
241
|
+
- **Auth:** None — guest API
|
|
242
|
+
- **FAANG filter:** `f_C` company ID parameter
|
|
243
|
+
|
|
244
|
+
### ๐ Amazon
|
|
245
|
+
- **Endpoint:** `GET https://www.amazon.jobs/en/search.json`
|
|
246
|
+
- **Auth:** None — public REST API
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## ⚙️ Filtering
|
|
251
|
+
|
|
252
|
+
Filters are **soft by default** — jobs missing a field pass through rather than being dropped:
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
jobs = scrape_jobs(
|
|
256
|
+
"python developer",
|
|
257
|
+
sources=["naukri", "shine"],
|
|
258
|
+
salary_min=600_000, # Only applied if salary data exists
|
|
259
|
+
city="Bengaluru", # Only applied if location data exists
|
|
260
|
+
skills=["python", "sql"], # Only applied if skills data exists
|
|
261
|
+
posted_within_days=14, # Only applied if date data exists
|
|
262
|
+
)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## Advanced Usage
|
|
268
|
+
|
|
269
|
+
### FAANG-only search
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
from jobhunter import scrape_jobs
|
|
273
|
+
from jobhunter.registry import default_registry
|
|
274
|
+
|
|
275
|
+
registry = default_registry()
|
|
276
|
+
faang = registry.faang_sources() # ['google_careers', 'amazon', 'meta', 'apple', 'netflix', 'microsoft']
|
|
277
|
+
|
|
278
|
+
jobs = scrape_jobs(
|
|
279
|
+
search_term="software engineer",
|
|
280
|
+
sources=faang,
|
|
281
|
+
results_wanted=20,
|
|
282
|
+
)
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### Parallel scraping with custom config
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
jobs = scrape_jobs(
|
|
289
|
+
search_term="backend developer",
|
|
290
|
+
sources=["naukri", "shine", "linkedin"],
|
|
291
|
+
city="Hyderabad",
|
|
292
|
+
results_wanted=100,
|
|
293
|
+
posted_within_days=7,
|
|
294
|
+
cache_enabled=True, # Cache responses locally
|
|
295
|
+
proxies=["http://..."], # Optional proxy list
|
|
296
|
+
)
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
### Auto-source selection
|
|
300
|
+
|
|
301
|
+
```python
|
|
302
|
+
# Automatically picks India sources when country="India"
|
|
303
|
+
jobs = scrape_jobs(
|
|
304
|
+
search_term="python developer",
|
|
305
|
+
country="India",
|
|
306
|
+
sources="auto", # → [indeed, linkedin, internshala, naukri, shine, unstop]
|
|
307
|
+
)
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
## 🧪 Running Tests
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
pip install -e . pytest
|
|
316
|
+
pytest tests/
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
## License
|
|
322
|
+
|
|
323
|
+
MIT
|
hirehunt-0.2.0/README.md
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# 🎯 HireHunt
|
|
2
|
+
|
|
3
|
+
**A programmable job-scraping framework for India & global markets.**
|
|
4
|
+
Aggregate jobs from **12 sources** — Naukri, Internshala, Shine, Unstop, LinkedIn, Indeed, and FAANG companies — into a unified, filterable, ranked dataset.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## ✨ Sources
|
|
9
|
+
|
|
10
|
+
| Source | Region | Type | Method |
|
|
11
|
+
|---|---|---|---|
|
|
12
|
+
| `naukri` | 🇮🇳 India | Jobs | REST API — 15,000+ listings |
|
|
13
|
+
| `shine` | 🇮🇳 India | Jobs | SSR JSON — 17,000+ listings |
|
|
14
|
+
| `internshala` | 🇮🇳 India | Internships / Jobs | HTML scraping |
|
|
15
|
+
| `unstop` | 🇮🇳 India | Hackathons / Competitions | REST API |
|
|
16
|
+
| `linkedin` | ๐ Global | Jobs | Guest HTML API |
|
|
17
|
+
| `indeed` | ๐ Global | Jobs | GraphQL API |
|
|
18
|
+
| `google_careers` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
19
|
+
| `amazon` | ๐ FAANG | Jobs | REST API |
|
|
20
|
+
| `meta` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
21
|
+
| `apple` | ๐ FAANG | Jobs | LinkedIn (keyword search) |
|
|
22
|
+
| `netflix` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
23
|
+
| `microsoft` | ๐ FAANG | Jobs | LinkedIn (company-filtered) |
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## 📦 Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install hirehunt
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
> **Note:** The PyPI package is `hirehunt`. The import name is `jobhunter`.
|
|
34
|
+
> ```python
|
|
35
|
+
> import jobhunter # ← this is correct after pip install hirehunt
|
|
36
|
+
> ```
|
|
37
|
+
|
|
38
|
+
**Requirements:** Python 3.10+
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## ⚡ Quick Start
|
|
43
|
+
|
|
44
|
+
### Python API
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from jobhunter import scrape_jobs
|
|
48
|
+
|
|
49
|
+
# Search across India's top job boards
|
|
50
|
+
jobs = scrape_jobs(
|
|
51
|
+
search_term="python developer",
|
|
52
|
+
sources=["naukri", "shine", "internshala"],
|
|
53
|
+
city="Bengaluru",
|
|
54
|
+
results_wanted=50,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
for job in jobs:
|
|
58
|
+
print(job)
|
|
59
|
+
# Python Developer @ TCS | Bengaluru | naukri
|
|
60
|
+
# Python Developer @ Infosys | Bengaluru | shine
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### CLI
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# India job search
|
|
67
|
+
jobhunter search "data scientist" --city Mumbai --sources naukri,shine
|
|
68
|
+
|
|
69
|
+
# Hackathons & competitions
|
|
70
|
+
jobhunter search "hackathon" --sources unstop
|
|
71
|
+
|
|
72
|
+
# FAANG company jobs
|
|
73
|
+
jobhunter search "software engineer" --sources google_careers,amazon,netflix
|
|
74
|
+
|
|
75
|
+
# Export to CSV
|
|
76
|
+
jobhunter search "backend developer" --sources naukri,linkedin --output jobs.csv
|
|
77
|
+
|
|
78
|
+
# Top 20 ranked results
|
|
79
|
+
jobhunter search "machine learning" --sources naukri,shine,linkedin --top 20
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## 🔧 Python API Reference
|
|
85
|
+
|
|
86
|
+
### `scrape_jobs()`
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from jobhunter import scrape_jobs
|
|
90
|
+
|
|
91
|
+
jobs = scrape_jobs(
|
|
92
|
+
search_term="python developer", # What to search
|
|
93
|
+
sources=["naukri", "shine"], # Which sources (list or "auto")
|
|
94
|
+
city="Bengaluru", # City filter (optional)
|
|
95
|
+
location="India", # Broader location (optional)
|
|
96
|
+
country="India", # Country (optional)
|
|
97
|
+
results_wanted=50, # Max results per source
|
|
98
|
+
job_kind="job", # "job", "internship", "hackathon"
|
|
99
|
+
remote=None, # True = remote only
|
|
100
|
+
salary_min=500000, # Min salary in INR (optional)
|
|
101
|
+
posted_within_days=30, # Only jobs from last N days
|
|
102
|
+
skills=["python", "django"], # Skill filter (optional)
|
|
103
|
+
experience_min=0, # Min years experience (optional)
|
|
104
|
+
experience_max=5, # Max years experience (optional)
|
|
105
|
+
)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### `Job` Object
|
|
109
|
+
|
|
110
|
+
Every source returns the same normalized `Job` dataclass:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
@dataclass
|
|
114
|
+
class Job:
|
|
115
|
+
title: str
|
|
116
|
+
company: str
|
|
117
|
+
source: str
|
|
118
|
+
job_url: str
|
|
119
|
+
|
|
120
|
+
location: str
|
|
121
|
+
city: str
|
|
122
|
+
country: str
|
|
123
|
+
work_mode: WorkMode # "remote" | "hybrid" | "onsite" | "unknown"
|
|
124
|
+
job_kind: JobKind # "job" | "internship" | "hackathon" | "competition"
|
|
125
|
+
|
|
126
|
+
salary: Money # min_amount, max_amount, currency, period
|
|
127
|
+
stipend: Money
|
|
128
|
+
|
|
129
|
+
skills: list[str]
|
|
130
|
+
experience_min: float | None
|
|
131
|
+
experience_max: float | None
|
|
132
|
+
description: str
|
|
133
|
+
date_posted: str | None
|
|
134
|
+
deadline: str | None # for competitions/hackathons
|
|
135
|
+
|
|
136
|
+
match_score: float # 0.0–1.0 after ranking
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Export
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from jobhunter import scrape_jobs
|
|
143
|
+
from jobhunter.exporters import to_csv, to_json, to_dataframe
|
|
144
|
+
|
|
145
|
+
jobs = scrape_jobs("python developer", sources=["naukri", "shine"])
|
|
146
|
+
|
|
147
|
+
to_csv(jobs, "jobs.csv")
|
|
148
|
+
to_json(jobs, "jobs.json")
|
|
149
|
+
df = to_dataframe(jobs) # pandas DataFrame
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## 🏗️ Project Structure
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
jobhunter/
|
|
158
|
+
โโโ __init__.py # scrape_jobs() entry point
|
|
159
|
+
โโโ models.py # Job, Money, WorkMode, JobKind dataclasses
|
|
160
|
+
โโโ query.py # JobQuery โ unified search parameters
|
|
161
|
+
โโโ engine.py # Orchestrates parallel scraping + dedup
|
|
162
|
+
โโโ registry.py # Scraper registry + auto-source selection
|
|
163
|
+
โโโ filtering.py # Soft filtering (salary, city, skills, date)
|
|
164
|
+
โโโ ranking.py # Relevance scoring / match_score
|
|
165
|
+
โโโ validation.py # Input validation
|
|
166
|
+
โโโ exceptions.py # Custom exceptions
|
|
167
|
+
โโโ cli.py # `jobhunter` CLI entry point
|
|
168
|
+
โ
|
|
169
|
+
โโโ scrapers/
|
|
170
|
+
โ โโโ base.py # BaseScraper ABC
|
|
171
|
+
โ โโโ naukri.py # ๐ฎ๐ณ Naukri โ /jobapi/v2/search REST API
|
|
172
|
+
โ โโโ shine.py # ๐ฎ๐ณ Shine โ __NEXT_DATA__ SSR JSON
|
|
173
|
+
โ โโโ internshala.py # ๐ฎ๐ณ Internshala โ HTML + pagination
|
|
174
|
+
โ โโโ unstop.py # ๐ฎ๐ณ Unstop โ hackathons REST API
|
|
175
|
+
โ โโโ linkedin.py # ๐ LinkedIn โ guest HTML API
|
|
176
|
+
โ โโโ indeed.py # ๐ Indeed โ GraphQL API
|
|
177
|
+
โ โโโ faang.py # ๐ Google, Amazon, Meta, Apple, Netflix, Microsoft
|
|
178
|
+
โ
|
|
179
|
+
โโโ exporters/
|
|
180
|
+
│   ├── csv.py
|
|
181
|
+
│   ├── json.py
|
|
182
|
+
│   └── dataframe.py
|
|
183
|
+
โ
|
|
184
|
+
โโโ utils/
|
|
185
|
+
โโโ fetchers.py # CachedFetcher with proxy + backend support
|
|
186
|
+
โโโ normalization.py # clean_text, parse_money, normalize_city, ...
|
|
187
|
+
|
|
188
|
+
tests/
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Source Details
|
|
194
|
+
|
|
195
|
+
### 🇮🇳 Naukri
|
|
196
|
+
- **Endpoint:** `GET https://www.naukri.com/jobapi/v2/search`
|
|
197
|
+
- **Auth:** Session cookies from page warm-up (automatic)
|
|
198
|
+
- **Fields:** Title, company, salary (LPA), location, skills, experience, date
|
|
199
|
+
- **Pagination:** `pageNo=N`, 20 results/page, 3,000+ pages available
|
|
200
|
+
|
|
201
|
+
### 🇮🇳 Shine
|
|
202
|
+
- **Endpoint:** `__NEXT_DATA__` SSR JSON embedded in HTML
|
|
203
|
+
- **Fields:** `jJT` (title), `jCName` (company), `jSal` (salary), `jLoc` (location), `jKwd` (skills), `jPDate` (date), `jSlug` (URL)
|
|
204
|
+
- **Pagination:** `?page=N`, 20 results/page, 900+ pages
|
|
205
|
+
|
|
206
|
+
### 🇮🇳 Internshala
|
|
207
|
+
- **Endpoint:** HTML scraping — `div[id^='individual_internship_'][internshipid]`
|
|
208
|
+
- **Pagination:** `?page=N`, 40+ cards/page
|
|
209
|
+
- **City filter:** URL slug e.g. `/internships/python-intern-in-bengaluru/`
|
|
210
|
+
|
|
211
|
+
### 🇮🇳 Unstop
|
|
212
|
+
- **Endpoint:** `GET https://unstop.com/api/public/opportunity/search-result`
|
|
213
|
+
- **Note:** Returns hackathons, coding competitions, and challenges only
|
|
214
|
+
- **Fields:** Title, organisation, skills, location, deadline, prize
|
|
215
|
+
|
|
216
|
+
### ๐ Indeed
|
|
217
|
+
- **Endpoint:** `POST https://apis.indeed.com/graphql`
|
|
218
|
+
- **Auth:** Public API key (included)
|
|
219
|
+
- **Pagination:** Cursor-based
|
|
220
|
+
|
|
221
|
+
### ๐ LinkedIn
|
|
222
|
+
- **Endpoint:** `GET https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search`
|
|
223
|
+
- **Auth:** None — guest API
|
|
224
|
+
- **FAANG filter:** `f_C` company ID parameter
|
|
225
|
+
|
|
226
|
+
### ๐ Amazon
|
|
227
|
+
- **Endpoint:** `GET https://www.amazon.jobs/en/search.json`
|
|
228
|
+
- **Auth:** None — public REST API
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## ⚙️ Filtering
|
|
233
|
+
|
|
234
|
+
Filters are **soft by default** — jobs missing a field pass through rather than being dropped:
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
jobs = scrape_jobs(
|
|
238
|
+
"python developer",
|
|
239
|
+
sources=["naukri", "shine"],
|
|
240
|
+
salary_min=600_000, # Only applied if salary data exists
|
|
241
|
+
city="Bengaluru", # Only applied if location data exists
|
|
242
|
+
skills=["python", "sql"], # Only applied if skills data exists
|
|
243
|
+
posted_within_days=14, # Only applied if date data exists
|
|
244
|
+
)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Advanced Usage
|
|
250
|
+
|
|
251
|
+
### FAANG-only search
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
from jobhunter import scrape_jobs
|
|
255
|
+
from jobhunter.registry import default_registry
|
|
256
|
+
|
|
257
|
+
registry = default_registry()
|
|
258
|
+
faang = registry.faang_sources() # ['google_careers', 'amazon', 'meta', 'apple', 'netflix', 'microsoft']
|
|
259
|
+
|
|
260
|
+
jobs = scrape_jobs(
|
|
261
|
+
search_term="software engineer",
|
|
262
|
+
sources=faang,
|
|
263
|
+
results_wanted=20,
|
|
264
|
+
)
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### Parallel scraping with custom config
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
jobs = scrape_jobs(
|
|
271
|
+
search_term="backend developer",
|
|
272
|
+
sources=["naukri", "shine", "linkedin"],
|
|
273
|
+
city="Hyderabad",
|
|
274
|
+
results_wanted=100,
|
|
275
|
+
posted_within_days=7,
|
|
276
|
+
cache_enabled=True, # Cache responses locally
|
|
277
|
+
proxies=["http://..."], # Optional proxy list
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Auto-source selection
|
|
282
|
+
|
|
283
|
+
```python
|
|
284
|
+
# Automatically picks India sources when country="India"
|
|
285
|
+
jobs = scrape_jobs(
|
|
286
|
+
search_term="python developer",
|
|
287
|
+
country="India",
|
|
288
|
+
sources="auto", # → [indeed, linkedin, internshala, naukri, shine, unstop]
|
|
289
|
+
)
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## 🧪 Running Tests
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
pip install -e . pytest
|
|
298
|
+
pytest tests/
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
---
|
|
302
|
+
|
|
303
|
+
## License
|
|
304
|
+
|
|
305
|
+
MIT
|