LinkedInWebScraper 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. linkedinwebscraper-1.1.1/.dockerignore +19 -0
  2. linkedinwebscraper-1.1.1/Dockerfile +24 -0
  3. linkedinwebscraper-1.1.1/LICENSE +21 -0
  4. linkedinwebscraper-1.1.1/MANIFEST.in +6 -0
  5. linkedinwebscraper-1.1.1/PKG-INFO +155 -0
  6. linkedinwebscraper-1.1.1/README.md +107 -0
  7. linkedinwebscraper-1.1.1/examples/example.py +15 -0
  8. linkedinwebscraper-1.1.1/examples/example_advanced_config.py +38 -0
  9. linkedinwebscraper-1.1.1/examples/example_openai.py +21 -0
  10. linkedinwebscraper-1.1.1/pyproject.toml +134 -0
  11. linkedinwebscraper-1.1.1/runtime.example.toml +35 -0
  12. linkedinwebscraper-1.1.1/setup.cfg +4 -0
  13. linkedinwebscraper-1.1.1/setup.py +3 -0
  14. linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/PKG-INFO +155 -0
  15. linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/SOURCES.txt +82 -0
  16. linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/dependency_links.txt +1 -0
  17. linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/entry_points.txt +2 -0
  18. linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/requires.txt +28 -0
  19. linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/top_level.txt +1 -0
  20. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/__init__.py +108 -0
  21. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/__init__.py +24 -0
  22. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/daily_scrape_service.py +203 -0
  23. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/linkedin_job_scraper.py +203 -0
  24. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/release_manager.py +138 -0
  25. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/runtime_runner.py +155 -0
  26. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/storage.py +56 -0
  27. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/__init__.py +53 -0
  28. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/constants.py +17 -0
  29. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/job_scraper_advanced_config.py +44 -0
  30. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/job_scraper_config.py +53 -0
  31. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/job_scraper_config_factory.py +38 -0
  32. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/keywords.py +21 -0
  33. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/locations.py +47 -0
  34. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/openai.py +5 -0
  35. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/options.py +24 -0
  36. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/runtime.py +288 -0
  37. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/storage.py +21 -0
  38. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/tech_stack.py +208 -0
  39. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/time_filters.py +5 -0
  40. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/user_agents.py +21 -0
  41. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/domain/__init__.py +6 -0
  42. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/domain/job_data_cleaner.py +364 -0
  43. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/domain/job_title_classifier.py +66 -0
  44. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/__init__.py +22 -0
  45. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/__init__.py +7 -0
  46. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/job_scraper.py +252 -0
  47. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/policy.py +75 -0
  48. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/utils.py +101 -0
  49. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/logging.py +119 -0
  50. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/__init__.py +23 -0
  51. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/job_description_processor.py +94 -0
  52. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/models.py +78 -0
  53. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/openai_handler.py +190 -0
  54. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/paths.py +63 -0
  55. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/__init__.py +6 -0
  56. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/file_manager.py +87 -0
  57. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/models.py +110 -0
  58. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/sqlite.py +271 -0
  59. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/interfaces/__init__.py +3 -0
  60. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/interfaces/cli/__init__.py +5 -0
  61. linkedinwebscraper-1.1.1/src/linkedin_web_scraper/interfaces/cli/main.py +197 -0
  62. linkedinwebscraper-1.1.1/tests/test_cli_main.py +96 -0
  63. linkedinwebscraper-1.1.1/tests/test_config_models.py +77 -0
  64. linkedinwebscraper-1.1.1/tests/test_daily_scrape_service.py +213 -0
  65. linkedinwebscraper-1.1.1/tests/test_example_advanced_config_smoke.py +58 -0
  66. linkedinwebscraper-1.1.1/tests/test_example_openai_smoke.py +61 -0
  67. linkedinwebscraper-1.1.1/tests/test_example_smoke.py +51 -0
  68. linkedinwebscraper-1.1.1/tests/test_file_manager.py +147 -0
  69. linkedinwebscraper-1.1.1/tests/test_github_workflows.py +101 -0
  70. linkedinwebscraper-1.1.1/tests/test_http_policy.py +201 -0
  71. linkedinwebscraper-1.1.1/tests/test_internal_codex_docs.py +34 -0
  72. linkedinwebscraper-1.1.1/tests/test_job_data_cleaner_transformations.py +94 -0
  73. linkedinwebscraper-1.1.1/tests/test_job_scraper_fixture.py +117 -0
  74. linkedinwebscraper-1.1.1/tests/test_linkedin_job_scraper.py +244 -0
  75. linkedinwebscraper-1.1.1/tests/test_location_normalization.py +72 -0
  76. linkedinwebscraper-1.1.1/tests/test_logging.py +74 -0
  77. linkedinwebscraper-1.1.1/tests/test_main_smoke.py +29 -0
  78. linkedinwebscraper-1.1.1/tests/test_openai_enrichment.py +157 -0
  79. linkedinwebscraper-1.1.1/tests/test_paths.py +69 -0
  80. linkedinwebscraper-1.1.1/tests/test_process_ds_jobs_smoke.py +72 -0
  81. linkedinwebscraper-1.1.1/tests/test_release_manager.py +76 -0
  82. linkedinwebscraper-1.1.1/tests/test_runtime_config.py +116 -0
  83. linkedinwebscraper-1.1.1/tests/test_runtime_runner.py +128 -0
  84. linkedinwebscraper-1.1.1/tests/test_sqlite_scrape_storage.py +155 -0
@@ -0,0 +1,19 @@
1
+ .git
2
+ .gitignore
3
+ .github
4
+ .pytest_cache
5
+ .ruff_cache
6
+ .tox
7
+ .venv
8
+ .tmp
9
+ artifacts
10
+ build
11
+ dist
12
+ site
13
+ __pycache__
14
+ *.pyc
15
+ *.pyo
16
+ *.pyd
17
+ *.sqlite
18
+ *.db
19
+ *.log
@@ -0,0 +1,24 @@
1
+ FROM python:3.12-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1
5
+
6
+ WORKDIR /app
7
+
8
+ ARG INSTALL_EXTRAS=""
9
+
10
+ RUN apt-get update \
11
+ && apt-get install -y --no-install-recommends build-essential \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ COPY LICENSE README.md MANIFEST.in pyproject.toml setup.py ./
15
+ COPY src ./src
16
+ COPY main.py process_ds_jobs.py runtime.example.toml ./
17
+
18
+ RUN python -m pip install --upgrade pip \
19
+ && if [ -n "$INSTALL_EXTRAS" ]; then pip install ".[$INSTALL_EXTRAS]"; else pip install .; fi
20
+
21
+ VOLUME ["/app/artifacts"]
22
+
23
+ ENTRYPOINT ["linkedin-webscraper"]
24
+ CMD ["scrape", "daily"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ricardo García Ramírez
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include LICENSE
3
+ include runtime.example.toml
4
+ include Dockerfile
5
+ include .dockerignore
6
+ recursive-include examples *.py
@@ -0,0 +1,155 @@
1
+ Metadata-Version: 2.4
2
+ Name: LinkedInWebScraper
3
+ Version: 1.1.1
4
+ Summary: A library for scraping LinkedIn job postings.
5
+ Author-email: Ricardo Garcia Ramirez <rgr.5882@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/ricardogr07/LinkedInWebScraper
8
+ Project-URL: Source, https://github.com/ricardogr07/LinkedInWebScraper
9
+ Keywords: linkedin,scraper,jobs,openai,pandas,sqlite
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: Python :: 3.14
17
+ Classifier: Topic :: Internet :: WWW/HTTP
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.11
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: beautifulsoup4>=4.12.3
23
+ Requires-Dist: numpy>=1.26.4
24
+ Requires-Dist: pandas>=2.2.0
25
+ Requires-Dist: requests>=2.31.0
26
+ Requires-Dist: SQLAlchemy>=2.0.36
27
+ Provides-Extra: openai
28
+ Requires-Dist: openai>=2.29.0; extra == "openai"
29
+ Requires-Dist: pydantic>=2.7.0; extra == "openai"
30
+ Provides-Extra: dev
31
+ Requires-Dist: build>=1.2.2; extra == "dev"
32
+ Requires-Dist: coverage[toml]>=7.6.0; extra == "dev"
33
+ Requires-Dist: mkdocs>=1.6.1; extra == "dev"
34
+ Requires-Dist: mkdocs-material>=9.5.34; extra == "dev"
35
+ Requires-Dist: mkdocstrings[python]>=0.26.1; extra == "dev"
36
+ Requires-Dist: openai>=2.29.0; extra == "dev"
37
+ Requires-Dist: pydantic>=2.7.0; extra == "dev"
38
+ Requires-Dist: pytest>=8.3.2; extra == "dev"
39
+ Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
40
+ Requires-Dist: pyrefly>=0.26.0; extra == "dev"
41
+ Requires-Dist: ruff>=0.6.9; extra == "dev"
42
+ Requires-Dist: tox>=4.21.2; extra == "dev"
43
+ Provides-Extra: docs
44
+ Requires-Dist: mkdocs>=1.6.1; extra == "docs"
45
+ Requires-Dist: mkdocs-material>=9.5.34; extra == "docs"
46
+ Requires-Dist: mkdocstrings[python]>=0.26.1; extra == "docs"
47
+ Dynamic: license-file
48
+
49
+ # LinkedInWebScraper
50
+
51
+ [![CI](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/ci.yml/badge.svg)](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/ci.yml)
52
+ [![Docs](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/docs.yml/badge.svg)](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/docs.yml)
53
+ [![Docs site](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://ricardogr07.github.io/LinkedInWebScraper/)
54
+ [![Release](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/release.yml/badge.svg)](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/release.yml)
55
+ [![PyPI version](https://img.shields.io/pypi/v/LinkedInWebScraper.svg)](https://pypi.org/project/LinkedInWebScraper/)
56
+ [![Python versions](https://img.shields.io/pypi/pyversions/LinkedInWebScraper.svg)](https://pypi.org/project/LinkedInWebScraper/)
57
+ [![License](https://img.shields.io/pypi/l/LinkedInWebScraper.svg)](https://github.com/ricardogr07/LinkedInWebScraper/blob/main/LICENSE)
58
+
59
+ LinkedInWebScraper is a production-minded Python library and scheduled job runner for collecting LinkedIn job listings, normalizing the data, persisting run history, and exporting reusable datasets.
60
+
61
+ ## Highlights
62
+
63
+ - Canonical package namespace under `linkedin_web_scraper`
64
+ - Typed programmatic config for single scrapes and TOML runtime config for CLI and scheduled runs
65
+ - Managed artifacts under `artifacts/jobs`, `artifacts/logs`, and `artifacts/state`
66
+ - SQLite-backed persistence through a clean application storage port
67
+ - Package CLI with `scrape once`, `scrape daily`, `export`, and `--dry-run`
68
+ - Optional OpenAI enrichment built on the current Responses API
69
+ - Runnable examples under `examples/`
70
+ - Automated releases that wait for green CI and Docs runs on `main`
71
+
72
+ ## Install
73
+
74
+ ```bash
75
+ pip install LinkedInWebScraper
76
+ pip install "LinkedInWebScraper[openai]"
77
+ pip install -e ".[dev]"
78
+ ```
79
+
80
+ ## Quickstart
81
+
82
+ ```python
83
+ from linkedin_web_scraper import (
84
+ JobScraperConfig,
85
+ LinkedInJobScraper,
86
+ RemoteType,
87
+ configure_logging,
88
+ )
89
+
90
+ logger = configure_logging(filename="example.log")
91
+ config = JobScraperConfig(
92
+ position="Data Analyst",
93
+ location="San Francisco",
94
+ remote=RemoteType.REMOTE,
95
+ )
96
+
97
+ jobs = LinkedInJobScraper(logger=logger, config=config).run()
98
+ print(jobs.head())
99
+ ```
100
+
101
+ ## Examples
102
+
103
+ Run the example scripts from `examples/`:
104
+
105
+ ```bash
106
+ python examples/example.py
107
+ python examples/example_advanced_config.py
108
+ python examples/example_openai.py
109
+ ```
110
+
111
+ The OpenAI example requires `OPENAI_API_KEY` in the environment.
112
+
113
+ ## CLI Runtime
114
+
115
+ ```bash
116
+ linkedin-webscraper scrape once --dry-run
117
+ linkedin-webscraper scrape daily
118
+ linkedin-webscraper export --run-id <run-id>
119
+ ```
120
+
121
+ Use `runtime.example.toml` as the template for a real `runtime.toml`. The root-level runtime scripts remain available for the `scrape daily` and `scrape once` workflows:
122
+
123
+ ```bash
124
+ python main.py
125
+ python process_ds_jobs.py
126
+ ```
127
+
128
+ ## Docs
129
+
130
+ - [Getting Started](docs/getting-started.md)
131
+ - [Configuration](docs/configuration.md)
132
+ - [Runtime and Deployment](docs/runtime.md)
133
+ - [Release and Automation](docs/development/release-and-automation.md)
134
+ - [Validation](docs/development/validation.md)
135
+ - [API Reference](docs/api.md)
136
+
137
+ ## Development
138
+
139
+ Run the local gate before risky pushes or merges:
140
+
141
+ ```bash
142
+ python -m tox -e preflight
143
+ ```
144
+
145
+ For a faster smoke-only path:
146
+
147
+ ```bash
148
+ python -m tox -e smoke
149
+ ```
150
+
151
+ The detailed validation matrix and release flow live in `docs/development/validation.md` and `docs/development/release-and-automation.md`.
152
+
153
+ ## License
154
+
155
+ This project is licensed under the MIT License.
@@ -0,0 +1,107 @@
1
+ # LinkedInWebScraper
2
+
3
+ [![CI](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/ci.yml/badge.svg)](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/ci.yml)
4
+ [![Docs](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/docs.yml/badge.svg)](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/docs.yml)
5
+ [![Docs site](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://ricardogr07.github.io/LinkedInWebScraper/)
6
+ [![Release](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/release.yml/badge.svg)](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/release.yml)
7
+ [![PyPI version](https://img.shields.io/pypi/v/LinkedInWebScraper.svg)](https://pypi.org/project/LinkedInWebScraper/)
8
+ [![Python versions](https://img.shields.io/pypi/pyversions/LinkedInWebScraper.svg)](https://pypi.org/project/LinkedInWebScraper/)
9
+ [![License](https://img.shields.io/pypi/l/LinkedInWebScraper.svg)](https://github.com/ricardogr07/LinkedInWebScraper/blob/main/LICENSE)
10
+
11
+ LinkedInWebScraper is a production-minded Python library and scheduled job runner for collecting LinkedIn job listings, normalizing the data, persisting run history, and exporting reusable datasets.
12
+
13
+ ## Highlights
14
+
15
+ - Canonical package namespace under `linkedin_web_scraper`
16
+ - Typed programmatic config for single scrapes and TOML runtime config for CLI and scheduled runs
17
+ - Managed artifacts under `artifacts/jobs`, `artifacts/logs`, and `artifacts/state`
18
+ - SQLite-backed persistence through a clean application storage port
19
+ - Package CLI with `scrape once`, `scrape daily`, `export`, and `--dry-run`
20
+ - Optional OpenAI enrichment built on the current Responses API
21
+ - Runnable examples under `examples/`
22
+ - Automated releases that wait for green CI and Docs runs on `main`
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install LinkedInWebScraper
28
+ pip install "LinkedInWebScraper[openai]"
29
+ pip install -e ".[dev]"
30
+ ```
31
+
32
+ ## Quickstart
33
+
34
+ ```python
35
+ from linkedin_web_scraper import (
36
+ JobScraperConfig,
37
+ LinkedInJobScraper,
38
+ RemoteType,
39
+ configure_logging,
40
+ )
41
+
42
+ logger = configure_logging(filename="example.log")
43
+ config = JobScraperConfig(
44
+ position="Data Analyst",
45
+ location="San Francisco",
46
+ remote=RemoteType.REMOTE,
47
+ )
48
+
49
+ jobs = LinkedInJobScraper(logger=logger, config=config).run()
50
+ print(jobs.head())
51
+ ```
52
+
53
+ ## Examples
54
+
55
+ Run the example scripts from `examples/`:
56
+
57
+ ```bash
58
+ python examples/example.py
59
+ python examples/example_advanced_config.py
60
+ python examples/example_openai.py
61
+ ```
62
+
63
+ The OpenAI example requires `OPENAI_API_KEY` in the environment.
64
+
65
+ ## CLI Runtime
66
+
67
+ ```bash
68
+ linkedin-webscraper scrape once --dry-run
69
+ linkedin-webscraper scrape daily
70
+ linkedin-webscraper export --run-id <run-id>
71
+ ```
72
+
73
+ Use `runtime.example.toml` as the template for a real `runtime.toml`. The root-level runtime scripts remain available for the `scrape daily` and `scrape once` workflows:
74
+
75
+ ```bash
76
+ python main.py
77
+ python process_ds_jobs.py
78
+ ```
79
+
80
+ ## Docs
81
+
82
+ - [Getting Started](docs/getting-started.md)
83
+ - [Configuration](docs/configuration.md)
84
+ - [Runtime and Deployment](docs/runtime.md)
85
+ - [Release and Automation](docs/development/release-and-automation.md)
86
+ - [Validation](docs/development/validation.md)
87
+ - [API Reference](docs/api.md)
88
+
89
+ ## Development
90
+
91
+ Run the local gate before risky pushes or merges:
92
+
93
+ ```bash
94
+ python -m tox -e preflight
95
+ ```
96
+
97
+ For a faster smoke-only path:
98
+
99
+ ```bash
100
+ python -m tox -e smoke
101
+ ```
102
+
103
+ The detailed validation matrix and release flow live in `docs/development/validation.md` and `docs/development/release-and-automation.md`.
104
+
105
+ ## License
106
+
107
+ This project is licensed under the MIT License.
@@ -0,0 +1,15 @@
1
+ from linkedin_web_scraper import JobScraperConfig, LinkedInJobScraper, Logger
2
+
3
+ logger = Logger("example.log")
4
+
5
+ # Define scraper configuration
6
+ config = JobScraperConfig(position="Data Analyst", location="San Francisco", remote="REMOTE")
7
+
8
+ # Initialize the scraper
9
+ scraper = LinkedInJobScraper(logger, config)
10
+
11
+ # Scrape job data
12
+ job_data = scraper.run()
13
+
14
+ # View the results
15
+ print(job_data.head())
@@ -0,0 +1,38 @@
1
+ from linkedin_web_scraper import (
2
+ JobScraperAdvancedConfig,
3
+ JobScraperConfig,
4
+ LinkedInJobScraper,
5
+ Logger,
6
+ )
7
+
8
+ logger = Logger("example_advanced_config.log")
9
+
10
+ KEYWORDS_LIST = [
11
+ "data",
12
+ "analytics",
13
+ "business intelligence",
14
+ "bi",
15
+ "statistical",
16
+ "statistics",
17
+ "analysis",
18
+ "power bi",
19
+ ]
20
+
21
+ advanced_config = JobScraperAdvancedConfig(KEYWORDS=KEYWORDS_LIST)
22
+
23
+ # Define scraper configuration
24
+ config = JobScraperConfig(
25
+ position="Data Analyst",
26
+ location="San Francisco",
27
+ remote="REMOTE",
28
+ advanced_config=advanced_config,
29
+ )
30
+
31
+ # Initialize the scraper
32
+ scraper = LinkedInJobScraper(logger, config)
33
+
34
+ # Scrape job data
35
+ job_data = scraper.run()
36
+
37
+ # View the results
38
+ print(job_data.head())
@@ -0,0 +1,21 @@
1
+ from linkedin_web_scraper import JobScraperConfig, LinkedInJobScraper, Logger
2
+
3
+ logger = Logger("example_openai.log")
4
+
5
+ # Define scraper configuration
6
+ config = JobScraperConfig(
7
+ position="Data Analyst",
8
+ location="San Francisco",
9
+ remote="REMOTE",
10
+ openai_enabled=True,
11
+ openai_model="gpt-4o-mini",
12
+ )
13
+
14
+ # Initialize the scraper
15
+ scraper = LinkedInJobScraper(logger, config)
16
+
17
+ # Scrape job data
18
+ job_data = scraper.run()
19
+
20
+ # View the results
21
+ print(job_data.head())
@@ -0,0 +1,134 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "LinkedInWebScraper"
7
+ version = "1.1.1"
8
+ description = "A library for scraping LinkedIn job postings."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [
14
+ { name = "Ricardo Garcia Ramirez", email = "rgr.5882@gmail.com" },
15
+ ]
16
+ keywords = ["linkedin", "scraper", "jobs", "openai", "pandas", "sqlite"]
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "Intended Audience :: Developers",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Programming Language :: Python :: 3.14",
25
+ "Topic :: Internet :: WWW/HTTP",
26
+ "Topic :: Software Development :: Libraries :: Python Modules",
27
+ ]
28
+ dependencies = [
29
+ "beautifulsoup4>=4.12.3",
30
+ "numpy>=1.26.4",
31
+ "pandas>=2.2.0",
32
+ "requests>=2.31.0",
33
+ "SQLAlchemy>=2.0.36",
34
+ ]
35
+
36
+ [project.optional-dependencies]
37
+ openai = [
38
+ "openai>=2.29.0",
39
+ "pydantic>=2.7.0",
40
+ ]
41
+ dev = [
42
+ "build>=1.2.2",
43
+ "coverage[toml]>=7.6.0",
44
+ "mkdocs>=1.6.1",
45
+ "mkdocs-material>=9.5.34",
46
+ "mkdocstrings[python]>=0.26.1",
47
+ "openai>=2.29.0",
48
+ "pydantic>=2.7.0",
49
+ "pytest>=8.3.2",
50
+ "pytest-cov>=5.0.0",
51
+ "pyrefly>=0.26.0",
52
+ "ruff>=0.6.9",
53
+ "tox>=4.21.2",
54
+ ]
55
+ docs = [
56
+ "mkdocs>=1.6.1",
57
+ "mkdocs-material>=9.5.34",
58
+ "mkdocstrings[python]>=0.26.1",
59
+ ]
60
+
61
+ [project.urls]
62
+ Homepage = "https://github.com/ricardogr07/LinkedInWebScraper"
63
+ Source = "https://github.com/ricardogr07/LinkedInWebScraper"
64
+
65
+ [project.scripts]
66
+ linkedin-webscraper = "linkedin_web_scraper.interfaces.cli.main:main"
67
+
68
+ [tool.setuptools]
69
+ include-package-data = true
70
+ package-dir = { "" = "src" }
71
+
72
+ [tool.setuptools.packages.find]
73
+ where = ["src"]
74
+ include = ["linkedin_web_scraper*"]
75
+
76
+ [tool.pytest.ini_options]
77
+ addopts = "-ra -p no:cacheprovider"
78
+ testpaths = ["tests"]
79
+ markers = [
80
+ "live_linkedin: tests that hit LinkedIn or related live HTTP endpoints",
81
+ "live_openai: tests that hit the OpenAI API",
82
+ ]
83
+
84
+ [tool.ruff]
85
+ line-length = 100
86
+ target-version = "py311"
87
+ src = ["src", "."]
88
+ extend-exclude = ["site", ".tmp", "pytest-cache-files-*"]
89
+
90
+ [tool.ruff.format]
91
+ quote-style = "double"
92
+ indent-style = "space"
93
+
94
+ [tool.ruff.lint]
95
+ select = ["E4", "E7", "E9", "F", "I", "UP", "B", "SIM"]
96
+
97
+ [tool.coverage.run]
98
+ branch = true
99
+ source = ["src"]
100
+
101
+ [tool.coverage.report]
102
+ show_missing = true
103
+ skip_covered = false
104
+
105
+ [tool.pyrefly]
106
+ project-includes = [
107
+ "src/linkedin_web_scraper/config/job_scraper_config.py",
108
+ "src/linkedin_web_scraper/config/job_scraper_advanced_config.py",
109
+ "src/linkedin_web_scraper/config/job_scraper_config_factory.py",
110
+ "src/linkedin_web_scraper/config/openai.py",
111
+ "src/linkedin_web_scraper/config/storage.py",
112
+ "src/linkedin_web_scraper/config/options.py",
113
+ "src/linkedin_web_scraper/config/runtime.py",
114
+ "src/linkedin_web_scraper/application/daily_scrape_service.py",
115
+ "src/linkedin_web_scraper/application/linkedin_job_scraper.py",
116
+ "src/linkedin_web_scraper/application/storage.py",
117
+ "src/linkedin_web_scraper/application/runtime_runner.py",
118
+ "src/linkedin_web_scraper/application/release_manager.py",
119
+ "src/linkedin_web_scraper/domain/job_data_cleaner.py",
120
+ "src/linkedin_web_scraper/domain/job_title_classifier.py",
121
+ "src/linkedin_web_scraper/infra/logging.py",
122
+ "src/linkedin_web_scraper/infra/paths.py",
123
+ "src/linkedin_web_scraper/infra/http/policy.py",
124
+ "src/linkedin_web_scraper/infra/http/utils.py",
125
+ "src/linkedin_web_scraper/infra/http/job_scraper.py",
126
+ "src/linkedin_web_scraper/infra/openai/models.py",
127
+ "src/linkedin_web_scraper/infra/openai/openai_handler.py",
128
+ "src/linkedin_web_scraper/infra/openai/job_description_processor.py",
129
+ "src/linkedin_web_scraper/infra/storage/models.py",
130
+ "src/linkedin_web_scraper/infra/storage/sqlite.py",
131
+ ]
132
+ search-path = ["src"]
133
+ python-version = "3.11"
134
+ use-ignore-files = true
@@ -0,0 +1,35 @@
1
+ # Example runtime configuration for CLI, containers, and scheduled runs.
2
+ # Keep OPENAI_API_KEY in the environment, never in this file.
3
+
4
+ [logging]
5
+ level = "INFO"
6
+ file_name = "main.log"
7
+
8
+ [storage]
9
+ file_name = "linkedin_jobs.sqlite"
10
+ state_dir = "artifacts/state"
11
+
12
+ [scrape.once]
13
+ position = "Data Scientist"
14
+ location = "Monterrey"
15
+ openai_enabled = false
16
+ openai_model = "gpt-4o-mini"
17
+ time_posted = "DAY"
18
+ remote_types = ["REMOTE", "HYBRID", "ON-SITE"]
19
+ file_name = "LinkedIn_Jobs_Data_Scientist_Monterrey.csv"
20
+ output_dir = "artifacts/jobs"
21
+ append = true
22
+
23
+ [scrape.daily]
24
+ cities = ["Monterrey", "Guadalajara", "Mexico City"]
25
+ position = "Data Scientist"
26
+ openai_enabled = false
27
+ openai_model = "gpt-4o-mini"
28
+ time_posted = "DAY"
29
+ output_dir = "artifacts/jobs"
30
+ combined_file_name = "LinkedIn_Jobs_Data_Scientist_Mexico.csv"
31
+
32
+ [export]
33
+ run_id = ""
34
+ file_name = "linkedin_jobs_export.csv"
35
+ output_dir = "artifacts/jobs"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ from setuptools import setup
2
+
3
+ setup()