python-job-scraper 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,331 @@
1
+ """Utility functions for parsing LinkedIn job data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from datetime import date
7
+ from typing import Any
8
+
9
+ from bs4 import BeautifulSoup
10
+
11
+ from jobscraper.linkedin.constant import JOB_TYPE_FILTER, JOB_TYPE_MAP
12
+ from jobscraper.model import Compensation, CompensationInterval, Location, ScraperInput
13
+ from jobscraper.util import extract_emails_from_text, markdown_converter
14
+
15
+
16
def parse_search_html(html: str) -> list[dict[str, Any]]:
    """Extract job listing dicts from a LinkedIn public search results page.

    Parses ``<div class="base-card">`` job cards and returns a list of dicts
    with keys: id, title, company, location, date, job_url.

    Args:
        html: Raw HTML from a LinkedIn /jobs/search/ page.

    Returns:
        List of raw job dicts, or empty list if none found.
    """

    def _text(tag) -> str | None:
        # Stripped text of a tag, or None when the tag was not found.
        return tag.get_text(strip=True) if tag else None

    jobs: list[dict[str, Any]] = []
    soup = BeautifulSoup(html, "lxml")

    for card in soup.find_all("div", class_="base-card"):
        try:
            # Job id comes from the entity URN when present, otherwise from
            # the data-job-id attribute; cards without an id are skipped.
            urn = card.get("data-entity-urn", "")
            job_id = urn.rsplit(":", 1)[-1] if urn else card.get("data-job-id", "")
            if not job_id:
                continue

            time_tag = card.find("time")
            link_tag = card.find("a", class_="base-card__full-link")

            jobs.append({
                "id": job_id,
                "title": _text(card.find("h3", class_="base-search-card__title")),
                "company": _text(card.find("h4", class_="base-search-card__subtitle")),
                "location": _text(card.find("span", class_="job-search-card__location")),
                "date": time_tag.get("datetime") if time_tag else None,
                "job_url": link_tag.get("href") if link_tag else None,
            })
        except Exception:
            # Best-effort: one malformed card must not abort the whole page.
            continue

    return jobs
66
+
67
+
68
def parse_location(raw: str) -> Location:
    """Parse a LinkedIn location string into a Location model.

    Handles ``"Bengaluru, Karnataka, India"``, ``"Remote in Mumbai, Maharashtra"``,
    or a bare city name.

    Args:
        raw: Raw location string.

    Returns:
        Populated Location model.
    """
    if not raw:
        return Location()

    # Drop a leading "Remote in " marker, then a trailing 5-6 digit postal code.
    text = re.sub(r"^(?:remote\s+in\s+)", "", raw.strip(), flags=re.IGNORECASE)
    text = re.sub(r"\s*\d{5,6}\s*$", "", text).strip()
    pieces = [piece.strip() for piece in text.split(",") if piece.strip()]

    if not pieces:
        return Location()
    if len(pieces) == 1:
        return Location(city=pieces[0])
    if len(pieces) == 2:
        return Location(city=pieces[0], state=pieces[1])
    # Three or more segments: city, state, country; extras are ignored.
    return Location(city=pieces[0], state=pieces[1], country=pieces[2])
94
+
95
+
96
def build_search_params(scraper_input: ScraperInput, start: int) -> dict[str, Any]:
    """Build query parameters for a LinkedIn /jobs/search/ request.

    Args:
        scraper_input: Validated scraper configuration.
        start: Result offset (0, 25, 50, …).

    Returns:
        Dict of query parameters.
    """
    query: dict[str, Any] = {
        "keywords": scraper_input.search_term,
        "start": start,
        "pageNum": 0,
    }

    if scraper_input.location:
        query["location"] = scraper_input.location

    if scraper_input.distance is not None:
        # Convert km to miles, then snap to the nearest radius LinkedIn accepts.
        as_miles = scraper_input.distance * 0.621
        query["distance"] = min((5, 10, 25, 50, 100), key=lambda r: abs(r - as_miles))

    if scraper_input.job_type:
        jt_code = JOB_TYPE_FILTER.get(scraper_input.job_type.value)
        if jt_code:
            query["f_JT"] = jt_code

    if scraper_input.hours_old:
        # f_TPR is LinkedIn's "time posted range" filter, in seconds: "r<secs>".
        query["f_TPR"] = f"r{scraper_input.hours_old * 3600}"

    if scraper_input.is_remote:
        query["f_WT"] = "2"

    return query
125
+
126
+
127
def parse_voyager_job(data: dict[str, Any]) -> dict[str, Any]:
    """Extract fields from a LinkedIn Voyager /jobs/jobPostings/{id} response.

    Args:
        data: The ``data`` sub-dict (or full response if already unwrapped)
            from the Voyager API JSON response.

    Returns:
        Dict with keys: title, description_html, employment_status, is_remote,
        listed_at, formatted_location, job_url_direct, is_easy_apply, company,
        company_url, company_logo, salary.
    """
    description = data.get("description") or {}
    workplace_types = data.get("workplaceTypes") or []

    parsed: dict[str, Any] = {
        "title": data.get("title"),
        "description_html": description.get("text") if isinstance(description, dict) else None,
        # Normalize e.g. "FULL_TIME" -> "full-time".
        "employment_status": (data.get("employmentStatus") or "").lower().replace("_", "-"),
        # Hybrid roles are treated as remote-capable here.
        "is_remote": any(wt.upper() in ("REMOTE", "HYBRID") for wt in workplace_types),
        "listed_at": data.get("listedAt"),
        "formatted_location": data.get("formattedLocation"),
    }

    # Apply method: an offsite entry carries an external URL; any onsite
    # variant means LinkedIn "Easy Apply". Key names vary by API decoration.
    apply_method = data.get("applyMethod") or {}
    offsite = next((k for k in apply_method if "OffsiteApply" in k), None)
    onsite = next((k for k in apply_method if "OnsiteApply" in k or "ComplexOnsite" in k), None)
    if offsite:
        parsed["job_url_direct"] = (apply_method[offsite] or {}).get("websiteUrl")
        parsed["is_easy_apply"] = False
    elif onsite:
        parsed["job_url_direct"] = None
        parsed["is_easy_apply"] = True
    else:
        parsed["job_url_direct"] = None
        parsed["is_easy_apply"] = None

    # Company details — key name also varies by decoration version.
    details = data.get("companyDetails") or {}
    company_key = next(
        (k for k in details if "WebJobPostingCompany" in k or "Company" in k), None
    )
    if company_key is None:
        parsed["company"] = None
        parsed["company_url"] = None
        parsed["company_logo"] = None
    else:
        resolved = (details[company_key] or {}).get("companyResolutionResult") or {}
        parsed["company"] = resolved.get("name")
        parsed["company_url"] = resolved.get("url")
        logo = resolved.get("logo") or {}
        image_key = next((k for k in logo if "VectorImage" in k), None)
        parsed["company_logo"] = (logo[image_key] or {}).get("rootUrl") if image_key else None

    parsed["salary"] = _extract_voyager_salary(data)
    return parsed
189
+
190
+
191
+ def _extract_voyager_salary(data: dict[str, Any]) -> dict[str, Any] | None:
192
+ """Extract salary range from a Voyager job response dict.
193
+
194
+ Tries ``salary.salaryInsight.baseSalary`` first, then
195
+ ``jobSalaryHighQualityMetadata.salaryRange`` as fallback.
196
+
197
+ Args:
198
+ data: Voyager job response dict.
199
+
200
+ Returns:
201
+ Dict with keys min, max, currency, unit — or None if no salary data.
202
+ """
203
+ try:
204
+ base = data["salary"]["salaryInsight"]["baseSalary"]
205
+ return {
206
+ "min": base["minValue"]["value"],
207
+ "max": base["maxValue"]["value"],
208
+ "currency": base.get("currencyCode", "INR"),
209
+ "unit": base.get("unitOfWork", "YEAR"),
210
+ }
211
+ except (KeyError, TypeError):
212
+ pass
213
+
214
+ try:
215
+ sr = data["jobSalaryHighQualityMetadata"]["salaryRange"]
216
+ return {
217
+ "min": sr["min"],
218
+ "max": sr["max"],
219
+ "currency": sr.get("currencyCode", "INR"),
220
+ "unit": sr.get("compensationType", "ANNUAL"),
221
+ }
222
+ except (KeyError, TypeError):
223
+ pass
224
+
225
+ return None
226
+
227
+
228
def parse_compensation(salary_dict: dict[str, Any] | None) -> Compensation | None:
    """Convert a salary dict (from _extract_voyager_salary) to a Compensation model.

    Args:
        salary_dict: Dict with keys min, max, currency, unit — or None.

    Returns:
        Compensation model or None if input is None or missing required fields.
    """
    if not salary_dict:
        return None

    # Voyager uses both noun ("YEAR") and adjective ("ANNUAL") unit spellings;
    # map each pair onto a single interval. Unknown units yield interval=None.
    unit_intervals: dict[str, CompensationInterval] = {}
    for names, interval in (
        (("ANNUAL", "YEAR"), CompensationInterval.YEARLY),
        (("MONTHLY", "MONTH"), CompensationInterval.MONTHLY),
        (("HOURLY", "HOUR"), CompensationInterval.HOURLY),
        (("WEEKLY", "WEEK"), CompensationInterval.WEEKLY),
        (("DAILY", "DAY"), CompensationInterval.DAILY),
    ):
        for name in names:
            unit_intervals[name] = interval

    unit = (salary_dict.get("unit") or "ANNUAL").upper()
    low = salary_dict.get("min")
    high = salary_dict.get("max")
    if low is None and high is None:
        return None

    try:
        return Compensation(
            interval=unit_intervals.get(unit),
            min_amount=None if low is None else float(low),
            max_amount=None if high is None else float(high),
            currency=salary_dict.get("currency") or "INR",
        )
    except (ValueError, TypeError):
        # Non-numeric amounts — treat as no usable salary data.
        return None
269
+
270
+
271
def parse_html_detail(
    html: str,
    description_format: str,
) -> tuple[str | None, str | None, list[str] | None]:
    """Parse description, direct apply URL, and emails from a LinkedIn job detail page.

    Args:
        html: Raw HTML from a ``/jobs/view/{id}/`` page.
        description_format: ``"markdown"`` or ``"html"``.

    Returns:
        Tuple of (description, job_url_direct, emails). Any element may be None.
    """
    soup = BeautifulSoup(html, "lxml")

    # Description — probe known container selectors in priority order.
    node = soup.find("div", class_="show-more-less-html__markup")
    if node is None:
        node = soup.find("div", {"id": "job-details"})
    if node is None:
        node = soup.find("div", class_="description__text")

    description: str | None = None
    if node:
        markup = str(node)
        description = (
            markdown_converter(markup) if description_format == "markdown" else markup
        )

    # External apply URL — only keep links that leave linkedin.com.
    job_url_direct: str | None = None
    apply_tag = soup.find("a", class_="apply-button") or soup.find(
        "a", attrs={"data-tracking-control-name": "public_jobs_apply-link-offsite"}
    )
    if apply_tag:
        href = apply_tag.get("href") or ""
        if href.startswith("http") and "linkedin.com" not in href:
            job_url_direct = str(href)

    # Emails — deduplicated; None (not []) when nothing was found.
    emails = list(set(extract_emails_from_text(soup.get_text()))) or None

    return description, job_url_direct, emails
315
+
316
+
317
def parse_date(date_str: str | None) -> date | None:
    """Parse an ISO 8601 date or datetime string to a date object.

    Accepts a plain date (``"2026-04-01"``) as before, and now also a full
    timestamp (``"2026-04-01T09:30:00Z"``) — only the calendar-date portion
    is kept. Timestamps previously fell through to None.

    Args:
        date_str: ISO 8601 date/datetime string, or None.

    Returns:
        date object, or None if the input is empty or unparseable.
    """
    if not date_str:
        return None
    try:
        # str() guards against non-string input; the first 10 characters of
        # an extended-format ISO string are exactly "YYYY-MM-DD".
        return date.fromisoformat(str(date_str).strip()[:10])
    except (ValueError, TypeError):
        return None
jobscraper/model.py ADDED
@@ -0,0 +1,144 @@
1
+ """Pydantic v2 models and Scraper abstract base class for jobscraper.
2
+
3
+ This module defines all shared data models, enums, and the abstract Scraper
4
+ interface used across all platform scrapers.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from abc import ABC, abstractmethod
10
+ from datetime import date
11
+ from enum import Enum
12
+ from typing import Literal
13
+
14
+ from pydantic import BaseModel
15
+
16
+
17
class Site(str, Enum):
    """Supported job platforms.

    Each value is the lowercase platform slug used to select a scraper.
    """

    INDEED = "indeed"
    GLASSDOOR = "glassdoor"
    LINKEDIN = "linkedin"
    # Planned platforms — uncomment as their scrapers are implemented.
    # NAUKRI = "naukri" # planned
    # FOUNDIT = "foundit" # planned
    # SHINE = "shine" # planned
    # INTERNSHALA = "internshala" # planned
    # UPWORK = "upwork" # planned
    # APNA = "apna" # planned
29
+
30
+
31
class Country(str, Enum):
    """Supported countries for job searches."""

    INDIA = "in"

    @classmethod
    def from_string(cls, value: str) -> "Country":
        """Convert a string like ``'india'`` or ``'in'`` to a Country member.

        Args:
            value: Country name or two-letter code, case-insensitive.

        Raises:
            ValueError: If the string matches no known country.
        """
        normalized = value.lower().strip()
        aliases = {"india": cls.INDIA, "in": cls.INDIA}
        country = aliases.get(normalized)
        if country is None:
            raise ValueError(f"Unknown country: {value!r}")
        return country
44
+
45
+
46
class CompensationInterval(str, Enum):
    """Pay interval for compensation amounts.

    Used by ``Compensation.interval`` to state what period the min/max
    amounts cover.
    """

    YEARLY = "yearly"
    MONTHLY = "monthly"
    WEEKLY = "weekly"
    DAILY = "daily"
    HOURLY = "hourly"
54
+
55
+
56
class JobType(str, Enum):
    """Employment type for a job posting.

    Values are the lowercase, unhyphenated forms used in scraper filters.
    """

    FULL_TIME = "fulltime"
    PART_TIME = "parttime"
    CONTRACT = "contract"
    TEMPORARY = "temporary"
    INTERNSHIP = "internship"
64
+
65
+
66
class Location(BaseModel):
    """Geographic location for a job posting."""

    city: str | None = None
    state: str | None = None
    country: str | None = None

    def display_location(self) -> str:
        """Return a human-readable ``"city, state, country"`` string.

        Unset fields are omitted; returns ``""`` when nothing is set.
        """
        return ", ".join(
            value for value in (self.city, self.state, self.country) if value
        )
77
+
78
+
79
class Compensation(BaseModel):
    """Salary or pay information for a job posting."""

    # Pay period the amounts refer to; None when the source did not say.
    interval: CompensationInterval | None = None
    min_amount: float | None = None
    max_amount: float | None = None
    # Currency code string (e.g. "INR"); defaults to Indian rupees.
    currency: str = "INR"
86
+
87
+
88
class JobPost(BaseModel):
    """A single job posting with all available metadata."""

    # Platform-specific posting identifier.
    id: str
    # Platform the posting came from.
    site: Site
    # Canonical URL of the posting on the platform.
    job_url: str
    # External (off-platform) apply URL, when one exists.
    job_url_direct: str | None = None
    title: str
    company: str | None = None
    location: Location | None = None
    date_posted: date | None = None
    # A posting may advertise several employment types.
    job_type: list[JobType] | None = None
    compensation: Compensation | None = None
    # None means the platform did not indicate either way.
    is_remote: bool | None = None
    # NOTE(review): Indeed-specific flag on the shared model — confirm
    # whether non-Indeed scrapers are expected to populate it.
    is_indeed_apply: bool | None = None
    job_level: str | None = None
    description: str | None = None
    # Email addresses extracted from the description, if any were found.
    emails: list[str] | None = None
    company_url: str | None = None
    company_logo: str | None = None
108
+
109
+
110
class JobResponse(BaseModel):
    """Container for a list of job postings returned by a scraper."""

    # NOTE: pydantic copies field defaults per instance, so the shared []
    # default is safe here (unlike a plain-class mutable default).
    jobs: list[JobPost] = []
114
+
115
+
116
class ScraperInput(BaseModel):
    """Validated input parameters passed to a scraper."""

    # Platforms to scrape in this run.
    site_name: list[Site]
    # Free-text search query (job title / keywords).
    search_term: str
    # Optional location string passed through to the platform.
    location: str | None = None
    # Search radius; assumed kilometres — the LinkedIn param builder converts
    # via * 0.621 to miles. TODO confirm the unit for other sites.
    distance: int | None = 50
    # Only return jobs posted within the last N hours; None disables the filter.
    hours_old: int | None = None
    # Target number of results to collect.
    results_wanted: int = 20
    # Result offset to start from (pagination / resume).
    offset: int = 0
    # Restrict results to one employment type; None means any.
    job_type: JobType | None = None
    # Filter for remote postings.
    is_remote: bool = False
    # Cookies by name, presumably sent with scraper HTTP requests — verify
    # against the individual scrapers.
    cookies: dict[str, str] | None = None
    # Country scope for Indeed searches.
    country_indeed: Country = Country.INDIA
    # Output format for job descriptions.
    description_format: Literal["markdown", "html"] = "markdown"
    # Whether to fetch each job's detail page for the full description.
    fetch_full_description: bool = True
    # Optional proxy URLs.
    proxies: list[str] | None = None
    # Path to a custom CA certificate bundle.
    ca_cert: str | None = None
    # Normalize reported salaries to an annual figure.
    enforce_annual_salary: bool = False
    # Override for the HTTP User-Agent header.
    user_agent: str | None = None
136
+
137
+
138
class Scraper(ABC):
    """Abstract base class that all platform scrapers must implement."""

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Run the scraper and return a JobResponse containing job postings.

        Args:
            scraper_input: Validated search configuration.

        Returns:
            JobResponse holding zero or more JobPost entries.
        """
        ...