python-job-scraper 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jobscraper/util.py ADDED
@@ -0,0 +1,500 @@
1
+ """Shared utilities: session factories, logging helpers, salary parsing, and converters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import itertools
6
+ import logging
7
+ import re
8
+ from typing import Any
9
+
10
+ import requests
11
+ from bs4 import BeautifulSoup
12
+ from markdownify import markdownify
13
+
14
+ from jobscraper.model import CompensationInterval, JobType, Site
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Module-level constant: canonical column ordering for job result output
18
+ # ---------------------------------------------------------------------------
19
+
20
# Canonical column ordering for job-result output. Consumers reindex
# scraped rows to this sequence so every site's results line up in the
# same column order.
desired_order: list[str] = [
    "id",
    "site",
    "job_url",
    "job_url_direct",
    "title",
    "company",
    "location",
    "date_posted",
    "job_type",
    "salary_source",
    "interval",
    "min_amount",
    "max_amount",
    "currency",
    "is_remote",
    "is_indeed_apply",
    "job_level",
    "company_url",
    "company_logo",
    "emails",
    "description",
]
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Logging
46
+ # ---------------------------------------------------------------------------
47
+
48
+
49
def create_logger(name: str) -> logging.Logger:
    """Return the ``jobscraper:<name>`` logger, attaching its handler once.

    The handler formats records as asctime - levelname - name - message.
    If the logger already has handlers it is returned untouched, which
    prevents duplicate output on repeated calls; propagation is disabled
    so the root logger does not re-emit records.
    """
    log = logging.getLogger(f"jobscraper:{name}")
    if log.handlers:
        return log
    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
    )
    log.addHandler(stream)
    log.propagate = False
    return log
63
+
64
+
65
def set_logger_level(verbose: int) -> None:
    """Apply a verbosity setting to every ``jobscraper:*`` logger.

    Mapping: 0 -> ERROR, 1 -> WARNING, 2 -> INFO; any other value also
    resolves to INFO. Loggers outside the jobscraper namespace are not
    touched.
    """
    chosen = {0: logging.ERROR, 1: logging.WARNING}.get(verbose, logging.INFO)
    registry = logging.Logger.manager.loggerDict
    for logger_name, candidate in registry.items():
        if not logger_name.startswith("jobscraper:"):
            continue
        if isinstance(candidate, logging.Logger):
            candidate.setLevel(chosen)
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # HTML / text converters
79
+ # ---------------------------------------------------------------------------
80
+
81
+
82
def markdown_converter(html: str | None) -> str | None:
    """Render an HTML fragment as markdown via ``markdownify``.

    A ``None`` input passes through as ``None``.
    """
    return None if html is None else markdownify(html)
90
+
91
+
92
def plain_converter(html: str | None) -> str | None:
    """Reduce an HTML fragment to whitespace-normalised plain text.

    Tags are stripped with BeautifulSoup (lxml parser) and all runs of
    whitespace collapse to single spaces. ``None`` passes through.
    """
    if html is None:
        return None
    text = BeautifulSoup(html, "lxml").get_text()
    return " ".join(text.split())
101
+
102
+
103
def remove_attributes(tag: Any) -> None:
    """Erase every HTML attribute from *tag* and its descendants, in place.

    The tag tree itself is left intact; only ``attrs`` dictionaries are
    cleared.
    """
    tag.attrs = {}
    for descendant in tag.find_all(True):
        descendant.attrs = {}
111
+
112
+
113
+ # ---------------------------------------------------------------------------
114
+ # Email extraction
115
+ # ---------------------------------------------------------------------------
116
+
117
+
118
def extract_emails_from_text(text: str) -> list[str]:
    """Return every email-address-shaped substring of *text*, in order."""
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Enum helpers
126
+ # ---------------------------------------------------------------------------
127
+
128
+
129
def get_enum_from_job_type(value: str) -> JobType | None:
    """Resolve a string to a JobType member, or None.

    The comparison is case-insensitive and whitespace-trimmed, matching
    either the member's value or its name.
    """
    needle = value.lower().strip()
    return next(
        (jt for jt in JobType if needle in (jt.value, jt.name.lower())),
        None,
    )
139
+
140
+
141
def get_enum_from_value(value: str) -> JobType:
    """Resolve a string to a JobType member, raising on failure.

    Raises:
        ValueError: if *value* matches no JobType value or name.
    """
    matched = get_enum_from_job_type(value)
    if matched is None:
        raise ValueError(f"Unknown job type: {value!r}")
    return matched
147
+
148
+
149
def map_str_to_site(name: str) -> Site:
    """Map a case-insensitive site name onto the Site enum.

    Raises:
        ValueError: if the upper-cased name is not a Site member.
    """
    key = name.upper()
    if key not in Site.__members__:
        raise ValueError(f"Unknown site: {name!r}")
    return Site[key]
158
+
159
+
160
+ # ---------------------------------------------------------------------------
161
+ # Currency and salary parsing
162
+ # ---------------------------------------------------------------------------
163
+
164
+
165
def currency_parser(value: str) -> float:
    """Parse a money string into a float rounded to two decimals.

    All characters except digits, commas, and periods are stripped. A
    single comma with no period acts as a decimal separator unless it is
    followed by exactly three digits (then it is a thousands separator,
    e.g. "50,000" -> 50000). In every other case commas are dropped as
    thousands separators.

    Raises:
        ValueError: if nothing numeric remains after cleaning.
    """
    digits = re.sub(r"[^\d.,]", "", value.strip())
    lone_comma = digits.count(",") == 1 and "." not in digits
    if lone_comma:
        _, fractional = digits.split(",")
        if len(fractional) == 3:
            # "50,000"-style grouping: the comma separates thousands.
            digits = digits.replace(",", "")
        else:
            # European decimal comma: promote it to a period.
            digits = digits.replace(",", ".")
    else:
        # Any remaining commas are thousands separators.
        digits = digits.replace(",", "")
    return round(float(digits), 2)
188
+
189
+
190
def _annualize(
    interval: CompensationInterval, min_val: float, max_val: float
) -> tuple[float, float]:
    """Scale a pay range to annual figures for the given interval.

    Multipliers: hourly 2080 (40 h x 52 weeks), daily 260, weekly 52,
    monthly 12. Yearly — and any unrecognised interval — passes through
    with a factor of 1. Both results are rounded to two decimals.
    """
    if interval == CompensationInterval.HOURLY:
        factor = 2080  # 40 hours x 52 weeks
    elif interval == CompensationInterval.DAILY:
        factor = 260
    elif interval == CompensationInterval.WEEKLY:
        factor = 52
    elif interval == CompensationInterval.MONTHLY:
        factor = 12
    else:
        factor = 1
    return round(min_val * factor, 2), round(max_val * factor, 2)
206
+
207
+
208
def extract_salary(
    text: str, enforce_annual: bool = False
) -> tuple[str | None, float | None, float | None, str]:
    """Parse salary patterns like '$min-$max' with optional 'k' suffix.

    The pay interval is inferred from the magnitude of the average value:
      - avg <= 1000   -> hourly
      - avg <= 20000  -> monthly
      - otherwise     -> yearly

    When ``enforce_annual`` is True the amounts are converted to annual
    equivalents and the interval reported as yearly.

    Args:
        text: Free text possibly containing a salary figure or range.
        enforce_annual: Convert amounts to annual equivalents.

    Returns:
        (interval_str | None, min_amount | None, max_amount | None, currency)
        where currency defaults to "INR" when no known symbol is present.
    """
    currency = "INR"
    currency_symbols = {"$": "USD", "£": "GBP", "€": "EUR", "₹": "INR"}

    for sym, cur in currency_symbols.items():
        if sym in text:
            currency = cur
            break

    # Try to match a min-max range first. The separator is an explicit
    # alternation (hyphen, en dash, or the word "to"); the previous
    # character class [-\u2013to]+ also matched stray 't'/'o' characters,
    # so free text like "100 o 200" was wrongly parsed as a range. An
    # optional currency symbol before the max value lets "$50k - $60k"
    # parse as a range as well.
    pattern = r"([\d,\.]+)\s*[kK]?\s*(?:-|\u2013|to)\s*[\$£€₹]?\s*([\d,\.]+)\s*[kK]?"
    match = re.search(pattern, text)

    if not match:
        # Fall back to a single number.
        single = re.search(r"([\d,\.]+)\s*[kK]?", text)
        if not single:
            return None, None, None, currency
        val_str = single.group(1)
        try:
            val = currency_parser(val_str)
            if "k" in single.group(0).lower():
                val *= 1000
        except (ValueError, AttributeError):
            return None, None, None, currency
        min_val = max_val = val
    else:
        try:
            min_str, max_str = match.group(1), match.group(2)
            min_val = currency_parser(min_str)
            max_val = currency_parser(max_str)
            # A 'k' anywhere in the matched span scales both endpoints
            # (e.g. "50-60k" means 50000-60000).
            if "k" in match.group(0).lower():
                min_val *= 1000
                max_val *= 1000
        except (ValueError, AttributeError):
            return None, None, None, currency

    # Detect interval by magnitude of the average value.
    avg = (min_val + max_val) / 2
    if avg <= 1000:
        interval = CompensationInterval.HOURLY
    elif avg <= 20000:
        interval = CompensationInterval.MONTHLY
    else:
        interval = CompensationInterval.YEARLY

    if enforce_annual:
        min_val, max_val = _annualize(interval, min_val, max_val)
        interval = CompensationInterval.YEARLY

    return interval.value, min_val, max_val, currency
273
+
274
+
275
+ # ---------------------------------------------------------------------------
276
+ # Job type extraction from description text
277
+ # ---------------------------------------------------------------------------
278
+
279
+
280
def extract_job_type(description: str) -> list[JobType] | None:
    """Detect employment types mentioned in a job description.

    Performs a case-insensitive keyword scan and returns the matching
    JobType members (in declaration order), or None when nothing matches.
    """
    haystack = description.lower()
    keyword_map: dict[JobType, list[str]] = {
        JobType.FULL_TIME: ["full time", "full-time", "fulltime"],
        JobType.PART_TIME: ["part time", "part-time", "parttime"],
        JobType.INTERNSHIP: ["internship", "intern"],
        JobType.CONTRACT: ["contract", "contractor", "freelance"],
        JobType.TEMPORARY: ["temporary", "temp "],
    }
    matches: list[JobType] = []
    for job_type, phrases in keyword_map.items():
        if any(phrase in haystack for phrase in phrases):
            matches.append(job_type)
    return matches or None
295
+
296
+
297
+ # ---------------------------------------------------------------------------
298
+ # Annual salary conversion (mutates job dict in-place)
299
+ # ---------------------------------------------------------------------------
300
+
301
+
302
def convert_to_annual(job_data: dict) -> None:
    """Rewrite a job dict's pay fields as annual equivalents, in place.

    Reads ``interval``, ``min_amount``, and ``max_amount``. Silently does
    nothing when the interval is absent/unrecognised or either amount is
    missing.
    """
    raw_interval = job_data.get("interval")
    if not raw_interval:
        return
    try:
        interval = CompensationInterval(raw_interval)
    except ValueError:
        # Unrecognised interval string: leave the dict untouched.
        return

    lo = job_data.get("min_amount")
    hi = job_data.get("max_amount")
    if lo is None or hi is None:
        return

    annual_lo, annual_hi = _annualize(interval, lo, hi)
    job_data["min_amount"] = annual_lo
    job_data["max_amount"] = annual_hi
    job_data["interval"] = CompensationInterval.YEARLY.value
325
+
326
+
327
+ # ---------------------------------------------------------------------------
328
+ # Proxy session classes
329
+ # ---------------------------------------------------------------------------
330
+
331
+
332
+ class RotatingProxySession:
333
+ """Base class that round-robins proxies from a string or list.
334
+
335
+ Accepts a comma-separated proxy string or a list of proxy strings.
336
+ Formats raw proxy strings into ``{"http": ..., "https": ...}`` dicts.
337
+ """
338
+
339
+ def __init__(self, proxies: list[str] | str | None = None) -> None:
340
+ """Initialize with optional proxy list or comma-separated string."""
341
+ if isinstance(proxies, str):
342
+ proxy_list = [p.strip() for p in proxies.split(",") if p.strip()]
343
+ elif proxies:
344
+ proxy_list = list(proxies)
345
+ else:
346
+ proxy_list = []
347
+ self._proxy_cycle: itertools.cycle | None = (
348
+ itertools.cycle(proxy_list) if proxy_list else None
349
+ )
350
+
351
+ def _get_proxy_dict(self) -> dict[str, str] | None:
352
+ """Return the next proxy dict or None if no proxies are configured."""
353
+ if self._proxy_cycle is None:
354
+ return None
355
+ proxy = next(self._proxy_cycle)
356
+ return {"http": proxy, "https": proxy}
357
+
358
+
359
class RequestsRotating(RotatingProxySession, requests.Session):
    """``requests.Session`` with per-request proxy rotation.

    Each request picks the next proxy from the rotation (unless the caller
    supplied one), optionally clears cookies first, and can verify against
    a custom CA certificate bundle.
    """

    def __init__(
        self,
        proxies: list[str] | str | None = None,
        ca_cert: str | None = None,
        clear_cookies: bool = False,
    ) -> None:
        """Initialise both bases; wire up CA bundle and cookie policy."""
        RotatingProxySession.__init__(self, proxies)
        requests.Session.__init__(self)
        self._clear_cookies = clear_cookies
        if ca_cert:
            # requests accepts a CA bundle path via Session.verify.
            self.verify = ca_cert

    def request(self, method: str, url: str, **kwargs):
        """Dispatch a request, rotating in the next proxy when configured."""
        if self._clear_cookies:
            self.cookies.clear()
        rotation = self._get_proxy_dict()
        if rotation:
            # Caller-supplied "proxies" always wins over the rotation.
            kwargs.setdefault("proxies", rotation)
        return super().request(method, url, **kwargs)
387
+
388
+
389
class TLSRotating(RotatingProxySession):
    """Session wrapper around ``tls_client.Session`` with proxy rotation.

    Picks the next entry of ``_IDENTIFIERS`` whenever the underlying
    session is (re)built, and falls back to a plain ``requests.Session``
    when the optional ``tls-client`` dependency is not installed.
    """

    _IDENTIFIERS: list[str] = ["chrome_120", "chrome_119", "firefox_108"]

    def __init__(
        self,
        proxies: list[str] | str | None = None,
        ca_cert: str | None = None,
    ) -> None:
        """Set up proxy rotation, remember the CA bundle, build the session."""
        super().__init__(proxies)
        self._ca_cert = ca_cert
        self._id_cycle: itertools.cycle = itertools.cycle(self._IDENTIFIERS)
        self._session = self._make_session()

    def _make_session(self):
        """Return a tls_client.Session, or requests.Session if unavailable."""
        try:
            import tls_client  # type: ignore[import]

            return tls_client.Session(
                client_identifier=next(self._id_cycle),
                random_tls_extension_order=True,
            )
        except ImportError:
            return requests.Session()

    def get(self, url: str, **kwargs) -> Any:
        """GET via the wrapped session, injecting the next proxy if any."""
        rotation = self._get_proxy_dict()
        if rotation:
            # tls_client takes a single "proxy" URL rather than a dict.
            kwargs.setdefault("proxy", rotation.get("https"))
        return self._session.get(url, **kwargs)

    def post(self, url: str, **kwargs) -> Any:
        """POST via the wrapped session, injecting the next proxy if any."""
        rotation = self._get_proxy_dict()
        if rotation:
            kwargs.setdefault("proxy", rotation.get("https"))
        return self._session.post(url, **kwargs)
434
+
435
+
436
def create_session(
    proxies: list[str] | str | None = None,
    ca_cert: str | None = None,
    is_tls: bool = True,
    clear_cookies: bool = False,
) -> TLSRotating | RequestsRotating:
    """Build a ready-to-use HTTP session with optional proxy rotation.

    Args:
        proxies: Comma-separated proxy string or list of proxy strings.
        ca_cert: Optional path to a CA certificate bundle.
        is_tls: True (default) yields a TLSRotating session; False yields
            a RequestsRotating session.
        clear_cookies: Only honoured by RequestsRotating (is_tls=False).

    Returns:
        A configured TLSRotating or RequestsRotating instance.
    """
    if not is_tls:
        return RequestsRotating(
            proxies=proxies, ca_cert=ca_cert, clear_cookies=clear_cookies
        )
    return TLSRotating(proxies=proxies, ca_cert=ca_cert)
459
+
460
+
461
# Aggregator and social-media domains that are never a company's own
# website; search results pointing at these hosts are skipped when
# resolving a company homepage.
_SKIP_DOMAINS: frozenset[str] = frozenset(
    [
        "indeed.com",
        "linkedin.com",
        "glassdoor.com",
        "facebook.com",
        "twitter.com",
        "youtube.com",
        "wikipedia.org",
        "instagram.com",
    ]
)
473
+
474
+
475
def get_company_website(company_name: str) -> str | None:
    """Look up a company's website via DuckDuckGo text search.

    Queries ``{company_name} official indian career site``, walks up to
    five results, and returns the first URL whose host is not a known
    aggregator or social domain (see ``_SKIP_DOMAINS``). Any search
    failure degrades to None — this is a best-effort lookup.

    Args:
        company_name: Name of the company to look up.

    Returns:
        URL string of the company's website, or None.
    """
    from ddgs import DDGS  # type: ignore[import]

    query = f"{company_name} official indian career site"
    try:
        hits = DDGS().text(query, max_results=5)
        for hit in hits:
            candidate = hit.get("href", "")
            if not candidate:
                continue
            if any(domain in candidate for domain in _SKIP_DOMAINS):
                continue
            return candidate
    except Exception:
        # Network/search errors are swallowed deliberately (best effort).
        pass
    return None
500
+ return None