python-job-scraper 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jobscraper/__init__.py +302 -0
- jobscraper/exception.py +32 -0
- jobscraper/glassdoor/__init__.py +309 -0
- jobscraper/glassdoor/constant.py +33 -0
- jobscraper/glassdoor/util.py +215 -0
- jobscraper/indeed/__init__.py +331 -0
- jobscraper/indeed/constant.py +38 -0
- jobscraper/indeed/util.py +157 -0
- jobscraper/linkedin/__init__.py +5 -0
- jobscraper/linkedin/_scraper.py +283 -0
- jobscraper/linkedin/constant.py +60 -0
- jobscraper/linkedin/util.py +331 -0
- jobscraper/model.py +144 -0
- jobscraper/util.py +500 -0
- python_job_scraper-0.3.0.dist-info/METADATA +221 -0
- python_job_scraper-0.3.0.dist-info/RECORD +19 -0
- python_job_scraper-0.3.0.dist-info/WHEEL +5 -0
- python_job_scraper-0.3.0.dist-info/licenses/LICENSE +21 -0
- python_job_scraper-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Utility functions for parsing Indeed job data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
|
|
11
|
+
from jobscraper.model import Compensation, CompensationInterval, Country, Location
|
|
12
|
+
from jobscraper.util import currency_parser, extract_emails_from_text, extract_salary
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse_mosaic_json(html: str) -> list[dict[str, Any]]:
    """Extract and parse the <script id="mosaic-data"> JSON blob from Indeed HTML.

    Returns a list of job dicts from the jobKeysWithInfo section,
    or an empty list on failure.
    """
    tag = BeautifulSoup(html, "lxml").find("script", {"id": "mosaic-data"})
    if not tag:
        return []

    script_body = tag.string or ""
    # The script assigns window.mosaic.providerData["mosaic-provider-jobcards"];
    # when that assignment is found, parse only its right-hand-side object.
    assignment = re.search(
        r'window\.mosaic\.providerData\["mosaic-provider-jobcards"\]\s*=\s*(\{.*?\});',
        script_body,
        re.DOTALL,
    )
    payload = assignment.group(1) if assignment else script_body

    try:
        blob = json.loads(payload)
        # Path into the payload: metaData -> mosaicProviderJobCardsModel -> results
        job_list = (
            blob.get("metaData", {})
            .get("mosaicProviderJobCardsModel", {})
            .get("results", [])
        )
    except (json.JSONDecodeError, AttributeError, KeyError):
        return []
    return job_list if isinstance(job_list, list) else []
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def parse_compensation(job_data: dict[str, Any]) -> Compensation | None:
    """Extract compensation info from an Indeed job dict.

    Indeed may provide salary under 'extractedSalary' (structured) or
    'salarySnippet' (text). Returns a Compensation model or None if no
    salary data is found.

    Args:
        job_data: A single job dict from the Indeed mosaic payload.

    Returns:
        A Compensation model, or None when no usable salary data exists.
    """
    # Try extractedSalary first (structured data)
    extracted = job_data.get("extractedSalary")
    if extracted:
        try:
            min_val = extracted.get("min")
            max_val = extracted.get("max")
            # "type" may be present but null; `or ""` keeps .lower() safe
            # (the old `.get("type", "")` default only applied when the key
            # was absent, so an explicit null crashed with AttributeError).
            interval_str = (extracted.get("type") or "").lower()
            interval_map: dict[str, CompensationInterval] = {
                "yearly": CompensationInterval.YEARLY,
                "monthly": CompensationInterval.MONTHLY,
                "weekly": CompensationInterval.WEEKLY,
                "daily": CompensationInterval.DAILY,
                "hourly": CompensationInterval.HOURLY,
            }
            interval = interval_map.get(interval_str)
            if min_val is not None or max_val is not None:
                return Compensation(
                    interval=interval,
                    min_amount=float(min_val) if min_val is not None else None,
                    max_amount=float(max_val) if max_val is not None else None,
                )
        # AttributeError additionally covers a non-dict extractedSalary value.
        except (ValueError, TypeError, AttributeError):
            pass

    # Try salarySnippet (text snippet)
    snippet = job_data.get("salarySnippet", {})
    if isinstance(snippet, dict):
        text = snippet.get("text", "")
    else:
        text = str(snippet) if snippet else ""

    if text:
        try:
            interval_str, min_val, max_val, currency = extract_salary(text)
            if min_val is not None:
                interval = CompensationInterval(interval_str) if interval_str else None
                return Compensation(
                    interval=interval,
                    min_amount=min_val,
                    max_amount=max_val,
                    currency=currency,
                )
        except (ValueError, TypeError):
            pass

    return None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def parse_location(raw: str) -> Location:
    """Parse an Indeed location string into a Location model.

    Handles formats like:
    - "Bangalore, Karnataka"
    - "Remote in Mumbai, Maharashtra"
    - "Hyderabad, Telangana 500001"
    """
    if not raw:
        return Location()

    # Drop a leading "Remote in " marker, then a trailing 5-6 digit postal code.
    stripped = re.sub(r"^(?:remote\s+in\s+)", "", raw.strip(), flags=re.IGNORECASE)
    stripped = re.sub(r"\s*\d{5,6}\s*$", "", stripped).strip()

    segments = [seg.strip() for seg in stripped.split(",") if seg.strip()]

    if not segments:
        return Location()
    if len(segments) == 1:
        return Location(city=segments[0])
    return Location(city=segments[0], state=segments[1])
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_job_detail_url(job_key: str, country: Country) -> str:
    """Build the full URL for an Indeed job detail page.

    Args:
        job_key: The Indeed job key (e.g. 'abc123def456').
        country: Country enum value (e.g. Country.INDIA).

    Returns:
        Full URL string for the job detail page.
    """
    # NOTE(review): function-scope import kept from the original —
    # presumably deferred to avoid an import cycle; confirm before hoisting.
    from jobscraper.indeed.constant import BASE_URL

    return f"{BASE_URL.format(country=country.value)}/viewjob?jk={job_key}"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def extract_emails(html: str) -> list[str]:
    """Extract email addresses from Indeed job detail HTML.

    Args:
        html: Raw HTML string from a job detail page.

    Returns:
        List of unique email addresses found, in first-seen order.
    """
    soup = BeautifulSoup(html, "lxml")
    text = soup.get_text()
    # dict.fromkeys dedupes while preserving first-seen order; the previous
    # list(set(...)) returned emails in arbitrary (hash-randomized) order.
    return list(dict.fromkeys(extract_emails_from_text(text)))
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""LinkedIn scraper implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
import time
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from jobscraper.exception import LinkedInException
|
|
11
|
+
from jobscraper.linkedin.constant import (
|
|
12
|
+
BASE_URL,
|
|
13
|
+
JOB_DETAIL_URL,
|
|
14
|
+
JOB_TYPE_MAP,
|
|
15
|
+
LINKEDIN_HEADERS,
|
|
16
|
+
PAGE_SIZE,
|
|
17
|
+
VOYAGER_DECORATION,
|
|
18
|
+
VOYAGER_HEADERS,
|
|
19
|
+
VOYAGER_JOB_URL,
|
|
20
|
+
)
|
|
21
|
+
from jobscraper.linkedin.util import (
|
|
22
|
+
build_search_params,
|
|
23
|
+
parse_compensation,
|
|
24
|
+
parse_date,
|
|
25
|
+
parse_html_detail,
|
|
26
|
+
parse_location,
|
|
27
|
+
parse_search_html,
|
|
28
|
+
parse_voyager_job,
|
|
29
|
+
)
|
|
30
|
+
from jobscraper.model import JobPost, JobResponse, JobType, Scraper, ScraperInput, Site
|
|
31
|
+
from jobscraper.util import create_logger, create_session, markdown_converter
|
|
32
|
+
|
|
33
|
+
# Module-level logger shared by the LinkedIn scraper below.
logger = create_logger("linkedin")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class LinkedInScraper(Scraper):
    """Scraper for linkedin.com job listings.

    Uses public HTML search (no auth required) to discover jobs. When
    ``cookies["li_at"]`` is supplied, fetches rich detail data from
    LinkedIn's internal Voyager API. Falls back to HTML detail page otherwise.
    """

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Fetch job listings from LinkedIn and return as a JobResponse.

        Args:
            scraper_input: Validated scraper configuration.

        Returns:
            JobResponse containing all collected JobPost objects.

        Raises:
            LinkedInException: On unrecoverable HTTP errors.
        """
        session = create_session(
            proxies=scraper_input.proxies,
            ca_cert=scraper_input.ca_cert,
            is_tls=True,
        )

        headers = dict(LINKEDIN_HEADERS)
        if scraper_input.user_agent:
            headers["User-Agent"] = scraper_input.user_agent

        li_at = (scraper_input.cookies or {}).get("li_at")

        # Warmup — acquire JSESSIONID from homepage cookies.
        # Best-effort: any failure is swallowed and we proceed without a
        # CSRF token (the public HTML search path does not require one).
        jsessionid: str | None = None
        try:
            warmup_resp = session.get(BASE_URL, headers=headers)
            cookies = getattr(warmup_resp, "cookies", {})
            raw_jsid = cookies.get("JSESSIONID") or ""
            # The cookie value arrives wrapped in literal double quotes;
            # strip them for use as the Csrf-Token header value.
            jsessionid = raw_jsid.strip('"') if raw_jsid else None
            time.sleep(random.uniform(1.0, 2.0))
        except Exception:
            pass

        # Build authenticated headers when li_at is present
        voyager_headers: dict[str, str] | None = None
        if li_at:
            cookie_str = f"li_at={li_at}"
            if jsessionid:
                # The Cookie header carries the quoted form back.
                cookie_str += f'; JSESSIONID="{jsessionid}"'
            headers["Cookie"] = cookie_str
            if jsessionid:
                headers["Csrf-Token"] = jsessionid

            voyager_headers = dict(VOYAGER_HEADERS)
            voyager_headers["Cookie"] = cookie_str
            if jsessionid:
                voyager_headers["Csrf-Token"] = jsessionid
            if scraper_input.user_agent:
                voyager_headers["User-Agent"] = scraper_input.user_agent

        jobs: list[JobPost] = []
        # LinkedIn paginates via a 'start' offset, PAGE_SIZE results per page.
        start = scraper_input.offset

        while len(jobs) < scraper_input.results_wanted:
            params = build_search_params(scraper_input, start)
            logger.info("Fetching LinkedIn jobs start=%d", start)

            try:
                response = session.get(
                    BASE_URL + "/jobs/search/", headers=headers, params=params
                )
            except Exception as exc:
                raise LinkedInException(
                    f"Failed to fetch LinkedIn search page: {exc}"
                ) from exc

            status = getattr(response, "status_code", None)
            if isinstance(status, int) and status >= 400:
                raise LinkedInException(
                    f"LinkedIn returned HTTP {status}. Bot detection may be active."
                )

            html = (
                response.text
                if hasattr(response, "text")
                else response.content.decode()
            )

            raw_jobs = parse_search_html(html)
            if not raw_jobs:
                logger.info("No jobs parsed at start=%d; stopping.", start)
                break

            for raw in raw_jobs:
                if len(jobs) >= scraper_input.results_wanted:
                    break
                job = self._build_job_post(
                    raw, scraper_input, session, headers, voyager_headers
                )
                if job:
                    jobs.append(job)

            # A short page means there are no further results to request.
            if len(raw_jobs) < PAGE_SIZE:
                break

            start += PAGE_SIZE
            # Randomized delay between pages to reduce bot-detection risk.
            time.sleep(random.uniform(0.5, 2.5))

        return JobResponse(jobs=jobs)

    def _build_job_post(
        self,
        raw: dict[str, Any],
        scraper_input: ScraperInput,
        session: Any,
        headers: dict[str, str],
        voyager_headers: dict[str, str] | None,
    ) -> JobPost | None:
        """Convert a raw LinkedIn search card dict to a JobPost.

        Uses Voyager API when voyager_headers are set; falls back to HTML
        detail page. Field-level try/except prevents partial data from
        crashing the scraper.

        Args:
            raw: Parsed search-card fields (id, title, company, location, date).
            scraper_input: Scraper configuration (detail fetching, format).
            session: HTTP session created in :meth:`scrape`.
            headers: Headers for plain HTML requests.
            voyager_headers: Headers for the Voyager API, or None when no
                ``li_at`` cookie was supplied.

        Returns:
            A JobPost, or None when the card lacks an id/title or an
            unexpected error occurs.
        """
        try:
            job_id = raw.get("id")
            if not job_id:
                logger.warning("LinkedIn job missing id, skipping")
                return None

            job_url = JOB_DETAIL_URL.format(job_id=job_id)

            title = raw.get("title")
            if not title:
                logger.warning("LinkedIn job %s missing title", job_id)
                return None

            company: str | None = raw.get("company")
            location = parse_location(raw.get("location") or "")
            date_posted = parse_date(raw.get("date"))

            # Detail-level fields default to None and are populated below
            # when a detail fetch (Voyager or HTML fallback) succeeds.
            description: str | None = None
            job_type: list[JobType] | None = None
            is_remote: bool | None = None
            compensation = None
            company_url: str | None = None
            company_logo: str | None = None
            job_url_direct: str | None = None
            is_indeed_apply: bool | None = None
            emails: list[str] | None = None

            # ---- Voyager path (authenticated) --------------------------------
            if voyager_headers and scraper_input.fetch_full_description:
                try:
                    vurl = VOYAGER_JOB_URL.format(job_id=job_id)
                    vresp = session.get(
                        vurl,
                        headers=voyager_headers,
                        params={"decorationId": VOYAGER_DECORATION},
                    )
                    vstatus = getattr(vresp, "status_code", None)
                    if isinstance(vstatus, int) and vstatus < 400:
                        vdata = vresp.json()
                        parsed = parse_voyager_job(vdata.get("data") or vdata)

                        # Voyager data is richer; prefer it over card fields.
                        title = parsed.get("title") or title
                        company = parsed.get("company") or company
                        company_url = parsed.get("company_url")
                        company_logo = parsed.get("company_logo")
                        job_url_direct = parsed.get("job_url_direct")
                        is_indeed_apply = parsed.get("is_easy_apply")
                        is_remote = parsed.get("is_remote")

                        emp = parsed.get("employment_status") or ""
                        jt = JOB_TYPE_MAP.get(emp)
                        job_type = [jt] if jt else None

                        loc_str = parsed.get("formatted_location")
                        if loc_str:
                            location = parse_location(loc_str)

                        listed_at = parsed.get("listed_at")
                        if listed_at:
                            try:
                                # The /1000 implies listed_at is epoch millis.
                                date_posted = datetime.fromtimestamp(
                                    int(listed_at) / 1000
                                ).date()
                            except (ValueError, OSError):
                                pass

                        compensation = parse_compensation(parsed.get("salary"))

                        raw_desc = parsed.get("description_html")
                        if raw_desc:
                            description = (
                                markdown_converter(raw_desc)
                                if scraper_input.description_format == "markdown"
                                else raw_desc
                            )

                    time.sleep(random.uniform(0.5, 2.5))
                except Exception as exc:
                    # Non-fatal: fall through to the HTML detail path below.
                    logger.warning(
                        "Job %s: Voyager fetch failed (%s); falling back to HTML",
                        job_id,
                        exc,
                    )

            # ---- HTML fallback (no cookie or Voyager failed) -----------------
            if description is None and scraper_input.fetch_full_description:
                try:
                    detail_resp = session.get(job_url, headers=headers)
                    detail_html = (
                        detail_resp.text
                        if hasattr(detail_resp, "text")
                        else detail_resp.content.decode()
                    )
                    description, job_url_direct, emails = parse_html_detail(
                        detail_html, scraper_input.description_format
                    )
                    time.sleep(random.uniform(0.5, 2.5))
                except Exception as exc:
                    # Non-fatal: the JobPost is still built from card fields.
                    logger.warning(
                        "Job %s: HTML detail fetch failed: %s", job_id, exc
                    )

            return JobPost(
                id=str(job_id),
                site=Site.LINKEDIN,
                job_url=job_url,
                job_url_direct=job_url_direct,
                title=title,
                company=company,
                location=location,
                date_posted=date_posted,
                job_type=job_type,
                compensation=compensation,
                is_remote=is_remote,
                is_indeed_apply=is_indeed_apply,
                description=description,
                emails=emails,
                company_url=company_url,
                company_logo=company_logo,
            )

        except Exception as exc:
            # Last-resort guard: skip this card rather than abort the scrape.
            logger.warning("Unexpected error building LinkedIn JobPost: %s", exc)
            return None
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Constants for the LinkedIn scraper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from jobscraper.model import JobType
|
|
6
|
+
|
|
7
|
+
# Endpoint URLs. The two ``{job_id}`` templates are filled via str.format.
BASE_URL = "https://www.linkedin.com"
JOBS_SEARCH_URL = BASE_URL + "/jobs/search/"
JOB_DETAIL_URL = BASE_URL + "/jobs/view/{job_id}/"
VOYAGER_JOB_URL = BASE_URL + "/voyager/api/jobs/jobPostings/{job_id}"
# Decoration id selecting the "full job posting" projection from Voyager.
VOYAGER_DECORATION = "com.linkedin.voyager.deco.jobs.web.shared.WebFullJobPosting-65"

# Headers for the public (unauthenticated) HTML search/detail pages.
LINKEDIN_HEADERS: dict[str, str] = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": BASE_URL + "/",
}

# Headers for the authenticated Voyager API (Cookie/Csrf-Token are added at
# request time by the scraper).
VOYAGER_HEADERS: dict[str, str] = {
    "Accept": "application/vnd.linkedin.normalized+json+2.1",
    "Accept-Language": "en-US,en;q=0.9",
    "X-RestLi-Protocol-Version": "2.0.0",
    "X-Li-Track": (
        '{"clientVersion":"1.13.1665","mpVersion":"1.13.1665","osName":"web",'
        '"timezoneOffset":5.5,"timezone":"Asia/Calcutta","deviceFormFactor":"DESKTOP",'
        '"mpName":"voyager-web","displayDensity":2,"displayWidth":1920,"displayHeight":1080}'
    ),
}

# Employment-status strings and single-letter codes → JobType enum.
# (Not URL filter codes — see JOB_TYPE_FILTER below for those.)
JOB_TYPE_MAP: dict[str, JobType] = {
    "full-time": JobType.FULL_TIME,
    "full_time": JobType.FULL_TIME,
    "f": JobType.FULL_TIME,
    "part-time": JobType.PART_TIME,
    "part_time": JobType.PART_TIME,
    "p": JobType.PART_TIME,
    "contract": JobType.CONTRACT,
    "c": JobType.CONTRACT,
    "temporary": JobType.TEMPORARY,
    "t": JobType.TEMPORARY,
    "internship": JobType.INTERNSHIP,
    "i": JobType.INTERNSHIP,
}

# JobType enum value → LinkedIn URL filter code (for search params)
JOB_TYPE_FILTER: dict[str, str] = {
    "fulltime": "F",
    "parttime": "P",
    "contract": "C",
    "temporary": "T",
    "internship": "I",
}

PAGE_SIZE = 25  # LinkedIn returns up to 25 results per search page
|