LinkedInWebScraper 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkedinwebscraper-1.1.1/.dockerignore +19 -0
- linkedinwebscraper-1.1.1/Dockerfile +24 -0
- linkedinwebscraper-1.1.1/LICENSE +21 -0
- linkedinwebscraper-1.1.1/MANIFEST.in +6 -0
- linkedinwebscraper-1.1.1/PKG-INFO +155 -0
- linkedinwebscraper-1.1.1/README.md +107 -0
- linkedinwebscraper-1.1.1/examples/example.py +15 -0
- linkedinwebscraper-1.1.1/examples/example_advanced_config.py +38 -0
- linkedinwebscraper-1.1.1/examples/example_openai.py +21 -0
- linkedinwebscraper-1.1.1/pyproject.toml +134 -0
- linkedinwebscraper-1.1.1/runtime.example.toml +35 -0
- linkedinwebscraper-1.1.1/setup.cfg +4 -0
- linkedinwebscraper-1.1.1/setup.py +3 -0
- linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/PKG-INFO +155 -0
- linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/SOURCES.txt +82 -0
- linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/dependency_links.txt +1 -0
- linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/entry_points.txt +2 -0
- linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/requires.txt +28 -0
- linkedinwebscraper-1.1.1/src/LinkedInWebScraper.egg-info/top_level.txt +1 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/__init__.py +108 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/__init__.py +24 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/daily_scrape_service.py +203 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/linkedin_job_scraper.py +203 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/release_manager.py +138 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/runtime_runner.py +155 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/application/storage.py +56 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/__init__.py +53 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/constants.py +17 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/job_scraper_advanced_config.py +44 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/job_scraper_config.py +53 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/job_scraper_config_factory.py +38 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/keywords.py +21 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/locations.py +47 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/openai.py +5 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/options.py +24 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/runtime.py +288 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/storage.py +21 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/tech_stack.py +208 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/time_filters.py +5 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/config/user_agents.py +21 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/domain/__init__.py +6 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/domain/job_data_cleaner.py +364 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/domain/job_title_classifier.py +66 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/__init__.py +22 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/__init__.py +7 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/job_scraper.py +252 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/policy.py +75 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/http/utils.py +101 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/logging.py +119 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/__init__.py +23 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/job_description_processor.py +94 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/models.py +78 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/openai/openai_handler.py +190 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/paths.py +63 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/__init__.py +6 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/file_manager.py +87 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/models.py +110 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/infra/storage/sqlite.py +271 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/interfaces/__init__.py +3 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/interfaces/cli/__init__.py +5 -0
- linkedinwebscraper-1.1.1/src/linkedin_web_scraper/interfaces/cli/main.py +197 -0
- linkedinwebscraper-1.1.1/tests/test_cli_main.py +96 -0
- linkedinwebscraper-1.1.1/tests/test_config_models.py +77 -0
- linkedinwebscraper-1.1.1/tests/test_daily_scrape_service.py +213 -0
- linkedinwebscraper-1.1.1/tests/test_example_advanced_config_smoke.py +58 -0
- linkedinwebscraper-1.1.1/tests/test_example_openai_smoke.py +61 -0
- linkedinwebscraper-1.1.1/tests/test_example_smoke.py +51 -0
- linkedinwebscraper-1.1.1/tests/test_file_manager.py +147 -0
- linkedinwebscraper-1.1.1/tests/test_github_workflows.py +101 -0
- linkedinwebscraper-1.1.1/tests/test_http_policy.py +201 -0
- linkedinwebscraper-1.1.1/tests/test_internal_codex_docs.py +34 -0
- linkedinwebscraper-1.1.1/tests/test_job_data_cleaner_transformations.py +94 -0
- linkedinwebscraper-1.1.1/tests/test_job_scraper_fixture.py +117 -0
- linkedinwebscraper-1.1.1/tests/test_linkedin_job_scraper.py +244 -0
- linkedinwebscraper-1.1.1/tests/test_location_normalization.py +72 -0
- linkedinwebscraper-1.1.1/tests/test_logging.py +74 -0
- linkedinwebscraper-1.1.1/tests/test_main_smoke.py +29 -0
- linkedinwebscraper-1.1.1/tests/test_openai_enrichment.py +157 -0
- linkedinwebscraper-1.1.1/tests/test_paths.py +69 -0
- linkedinwebscraper-1.1.1/tests/test_process_ds_jobs_smoke.py +72 -0
- linkedinwebscraper-1.1.1/tests/test_release_manager.py +76 -0
- linkedinwebscraper-1.1.1/tests/test_runtime_config.py +116 -0
- linkedinwebscraper-1.1.1/tests/test_runtime_runner.py +128 -0
- linkedinwebscraper-1.1.1/tests/test_sqlite_scrape_storage.py +155 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
4
|
+
PYTHONUNBUFFERED=1
|
|
5
|
+
|
|
6
|
+
WORKDIR /app
|
|
7
|
+
|
|
8
|
+
ARG INSTALL_EXTRAS=""
|
|
9
|
+
|
|
10
|
+
RUN apt-get update \
|
|
11
|
+
&& apt-get install -y --no-install-recommends build-essential \
|
|
12
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
13
|
+
|
|
14
|
+
COPY LICENSE README.md MANIFEST.in pyproject.toml setup.py ./
|
|
15
|
+
COPY src ./src
|
|
16
|
+
COPY main.py process_ds_jobs.py runtime.example.toml ./
|
|
17
|
+
|
|
18
|
+
RUN python -m pip install --upgrade pip \
|
|
19
|
+
&& if [ -n "$INSTALL_EXTRAS" ]; then pip install ".[$INSTALL_EXTRAS]"; else pip install .; fi
|
|
20
|
+
|
|
21
|
+
VOLUME ["/app/artifacts"]
|
|
22
|
+
|
|
23
|
+
ENTRYPOINT ["linkedin-webscraper"]
|
|
24
|
+
CMD ["scrape", "daily"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Ricardo García Ramírez
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: LinkedInWebScraper
|
|
3
|
+
Version: 1.1.1
|
|
4
|
+
Summary: A library for scraping LinkedIn job postings.
|
|
5
|
+
Author-email: Ricardo Garcia Ramirez <rgr.5882@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ricardogr07/LinkedInWebScraper
|
|
8
|
+
Project-URL: Source, https://github.com/ricardogr07/LinkedInWebScraper
|
|
9
|
+
Keywords: linkedin,scraper,jobs,openai,pandas,sqlite
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: beautifulsoup4>=4.12.3
|
|
23
|
+
Requires-Dist: numpy>=1.26.4
|
|
24
|
+
Requires-Dist: pandas>=2.2.0
|
|
25
|
+
Requires-Dist: requests>=2.31.0
|
|
26
|
+
Requires-Dist: SQLAlchemy>=2.0.36
|
|
27
|
+
Provides-Extra: openai
|
|
28
|
+
Requires-Dist: openai>=2.29.0; extra == "openai"
|
|
29
|
+
Requires-Dist: pydantic>=2.7.0; extra == "openai"
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: build>=1.2.2; extra == "dev"
|
|
32
|
+
Requires-Dist: coverage[toml]>=7.6.0; extra == "dev"
|
|
33
|
+
Requires-Dist: mkdocs>=1.6.1; extra == "dev"
|
|
34
|
+
Requires-Dist: mkdocs-material>=9.5.34; extra == "dev"
|
|
35
|
+
Requires-Dist: mkdocstrings[python]>=0.26.1; extra == "dev"
|
|
36
|
+
Requires-Dist: openai>=2.29.0; extra == "dev"
|
|
37
|
+
Requires-Dist: pydantic>=2.7.0; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest>=8.3.2; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pyrefly>=0.26.0; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff>=0.6.9; extra == "dev"
|
|
42
|
+
Requires-Dist: tox>=4.21.2; extra == "dev"
|
|
43
|
+
Provides-Extra: docs
|
|
44
|
+
Requires-Dist: mkdocs>=1.6.1; extra == "docs"
|
|
45
|
+
Requires-Dist: mkdocs-material>=9.5.34; extra == "docs"
|
|
46
|
+
Requires-Dist: mkdocstrings[python]>=0.26.1; extra == "docs"
|
|
47
|
+
Dynamic: license-file
|
|
48
|
+
|
|
49
|
+
# LinkedInWebScraper
|
|
50
|
+
|
|
51
|
+
[CI](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/ci.yml)
|
|
52
|
+
[Docs build](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/docs.yml)
|
|
53
|
+
[Documentation site](https://ricardogr07.github.io/LinkedInWebScraper/)
|
|
54
|
+
[Release](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/release.yml)
|
|
55
|
+
[PyPI package](https://pypi.org/project/LinkedInWebScraper/)
|
|
56
|
+
[Supported Python versions](https://pypi.org/project/LinkedInWebScraper/)
|
|
57
|
+
[License: MIT](https://github.com/ricardogr07/LinkedInWebScraper/blob/main/LICENSE)
|
|
58
|
+
|
|
59
|
+
LinkedInWebScraper is a production-minded Python library and scheduled job runner for collecting LinkedIn job listings, normalizing the data, persisting run history, and exporting reusable datasets.
|
|
60
|
+
|
|
61
|
+
## Highlights
|
|
62
|
+
|
|
63
|
+
- Canonical package namespace under `linkedin_web_scraper`
|
|
64
|
+
- Typed programmatic config for single scrapes and TOML runtime config for CLI and scheduled runs
|
|
65
|
+
- Managed artifacts under `artifacts/jobs`, `artifacts/logs`, and `artifacts/state`
|
|
66
|
+
- SQLite-backed persistence through a clean application storage port
|
|
67
|
+
- Package CLI with `scrape once`, `scrape daily`, `export`, and `--dry-run`
|
|
68
|
+
- Optional OpenAI enrichment built on the current Responses API
|
|
69
|
+
- Runnable examples under `examples/`
|
|
70
|
+
- Automated release workflow that waits for green CI and Docs runs on `main`
|
|
71
|
+
|
|
72
|
+
## Install
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install LinkedInWebScraper
|
|
76
|
+
pip install LinkedInWebScraper[openai]
|
|
77
|
+
pip install -e .[dev]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Quickstart
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from linkedin_web_scraper import (
|
|
84
|
+
JobScraperConfig,
|
|
85
|
+
LinkedInJobScraper,
|
|
86
|
+
RemoteType,
|
|
87
|
+
configure_logging,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
logger = configure_logging(filename="example.log")
|
|
91
|
+
config = JobScraperConfig(
|
|
92
|
+
position="Data Analyst",
|
|
93
|
+
location="San Francisco",
|
|
94
|
+
remote=RemoteType.REMOTE,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
jobs = LinkedInJobScraper(logger=logger, config=config).run()
|
|
98
|
+
print(jobs.head())
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Examples
|
|
102
|
+
|
|
103
|
+
Run the example scripts from `examples/`:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
python examples/example.py
|
|
107
|
+
python examples/example_advanced_config.py
|
|
108
|
+
python examples/example_openai.py
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
The OpenAI example requires `OPENAI_API_KEY` in the environment.
|
|
112
|
+
|
|
113
|
+
## CLI Runtime
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
linkedin-webscraper scrape once --dry-run
|
|
117
|
+
linkedin-webscraper scrape daily
|
|
118
|
+
linkedin-webscraper export --run-id <run-id>
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Use `runtime.example.toml` as the template for a real `runtime.toml`. The root-level runtime scripts remain available for the `scrape daily` and `scrape once` workflows:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
python main.py
|
|
125
|
+
python process_ds_jobs.py
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Docs
|
|
129
|
+
|
|
130
|
+
- [Getting Started](docs/getting-started.md)
|
|
131
|
+
- [Configuration](docs/configuration.md)
|
|
132
|
+
- [Runtime and Deployment](docs/runtime.md)
|
|
133
|
+
- [Release and Automation](docs/development/release-and-automation.md)
|
|
134
|
+
- [Validation](docs/development/validation.md)
|
|
135
|
+
- [API Reference](docs/api.md)
|
|
136
|
+
|
|
137
|
+
## Development
|
|
138
|
+
|
|
139
|
+
Run the local gate before risky pushes or merges:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
python -m tox -e preflight
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
For a faster smoke-only path:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
python -m tox -e smoke
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
The detailed validation matrix and release flow live in `docs/development/validation.md` and `docs/development/release-and-automation.md`.
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# LinkedInWebScraper
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/ci.yml)
|
|
4
|
+
[Docs build](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/docs.yml)
|
|
5
|
+
[Documentation site](https://ricardogr07.github.io/LinkedInWebScraper/)
|
|
6
|
+
[Release](https://github.com/ricardogr07/LinkedInWebScraper/actions/workflows/release.yml)
|
|
7
|
+
[PyPI package](https://pypi.org/project/LinkedInWebScraper/)
|
|
8
|
+
[Supported Python versions](https://pypi.org/project/LinkedInWebScraper/)
|
|
9
|
+
[License: MIT](https://github.com/ricardogr07/LinkedInWebScraper/blob/main/LICENSE)
|
|
10
|
+
|
|
11
|
+
LinkedInWebScraper is a production-minded Python library and scheduled job runner for collecting LinkedIn job listings, normalizing the data, persisting run history, and exporting reusable datasets.
|
|
12
|
+
|
|
13
|
+
## Highlights
|
|
14
|
+
|
|
15
|
+
- Canonical package namespace under `linkedin_web_scraper`
|
|
16
|
+
- Typed programmatic config for single scrapes and TOML runtime config for CLI and scheduled runs
|
|
17
|
+
- Managed artifacts under `artifacts/jobs`, `artifacts/logs`, and `artifacts/state`
|
|
18
|
+
- SQLite-backed persistence through a clean application storage port
|
|
19
|
+
- Package CLI with `scrape once`, `scrape daily`, `export`, and `--dry-run`
|
|
20
|
+
- Optional OpenAI enrichment built on the current Responses API
|
|
21
|
+
- Runnable examples under `examples/`
|
|
22
|
+
- Automated release workflow that waits for green CI and Docs runs on `main`
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install LinkedInWebScraper
|
|
28
|
+
pip install LinkedInWebScraper[openai]
|
|
29
|
+
pip install -e .[dev]
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quickstart
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from linkedin_web_scraper import (
|
|
36
|
+
JobScraperConfig,
|
|
37
|
+
LinkedInJobScraper,
|
|
38
|
+
RemoteType,
|
|
39
|
+
configure_logging,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
logger = configure_logging(filename="example.log")
|
|
43
|
+
config = JobScraperConfig(
|
|
44
|
+
position="Data Analyst",
|
|
45
|
+
location="San Francisco",
|
|
46
|
+
remote=RemoteType.REMOTE,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
jobs = LinkedInJobScraper(logger=logger, config=config).run()
|
|
50
|
+
print(jobs.head())
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Examples
|
|
54
|
+
|
|
55
|
+
Run the example scripts from `examples/`:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
python examples/example.py
|
|
59
|
+
python examples/example_advanced_config.py
|
|
60
|
+
python examples/example_openai.py
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The OpenAI example requires `OPENAI_API_KEY` in the environment.
|
|
64
|
+
|
|
65
|
+
## CLI Runtime
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
linkedin-webscraper scrape once --dry-run
|
|
69
|
+
linkedin-webscraper scrape daily
|
|
70
|
+
linkedin-webscraper export --run-id <run-id>
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Use `runtime.example.toml` as the template for a real `runtime.toml`. The root-level runtime scripts remain available for the `scrape daily` and `scrape once` workflows:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
python main.py
|
|
77
|
+
python process_ds_jobs.py
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Docs
|
|
81
|
+
|
|
82
|
+
- [Getting Started](docs/getting-started.md)
|
|
83
|
+
- [Configuration](docs/configuration.md)
|
|
84
|
+
- [Runtime and Deployment](docs/runtime.md)
|
|
85
|
+
- [Release and Automation](docs/development/release-and-automation.md)
|
|
86
|
+
- [Validation](docs/development/validation.md)
|
|
87
|
+
- [API Reference](docs/api.md)
|
|
88
|
+
|
|
89
|
+
## Development
|
|
90
|
+
|
|
91
|
+
Run the local gate before risky pushes or merges:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
python -m tox -e preflight
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
For a faster smoke-only path:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
python -m tox -e smoke
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
The detailed validation matrix and release flow live in `docs/development/validation.md` and `docs/development/release-and-automation.md`.
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
|
|
107
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from linkedin_web_scraper import JobScraperConfig, LinkedInJobScraper, Logger
|
|
2
|
+
|
|
3
|
+
logger = Logger("example.log")
|
|
4
|
+
|
|
5
|
+
# Define scraper configuration
|
|
6
|
+
config = JobScraperConfig(position="Data Analyst", location="San Francisco", remote="REMOTE")
|
|
7
|
+
|
|
8
|
+
# Initialize the scraper
|
|
9
|
+
scraper = LinkedInJobScraper(logger, config)
|
|
10
|
+
|
|
11
|
+
# Scrape job data
|
|
12
|
+
job_data = scraper.run()
|
|
13
|
+
|
|
14
|
+
# View the results
|
|
15
|
+
print(job_data.head())
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from linkedin_web_scraper import (
|
|
2
|
+
JobScraperAdvancedConfig,
|
|
3
|
+
JobScraperConfig,
|
|
4
|
+
LinkedInJobScraper,
|
|
5
|
+
Logger,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
logger = Logger("example_advanced_config.log")
|
|
9
|
+
|
|
10
|
+
KEYWORDS_LIST = [
|
|
11
|
+
"data",
|
|
12
|
+
"analytics",
|
|
13
|
+
"business intelligence",
|
|
14
|
+
"bi",
|
|
15
|
+
"statistical",
|
|
16
|
+
"statistics",
|
|
17
|
+
"analysis",
|
|
18
|
+
"power bi",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
advanced_config = JobScraperAdvancedConfig(KEYWORDS=KEYWORDS_LIST)
|
|
22
|
+
|
|
23
|
+
# Define scraper configuration
|
|
24
|
+
config = JobScraperConfig(
|
|
25
|
+
position="Data Analyst",
|
|
26
|
+
location="San Francisco",
|
|
27
|
+
remote="REMOTE",
|
|
28
|
+
advanced_config=advanced_config,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Initialize the scraper
|
|
32
|
+
scraper = LinkedInJobScraper(logger, config)
|
|
33
|
+
|
|
34
|
+
# Scrape job data
|
|
35
|
+
job_data = scraper.run()
|
|
36
|
+
|
|
37
|
+
# View the results
|
|
38
|
+
print(job_data.head())
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from linkedin_web_scraper import JobScraperConfig, LinkedInJobScraper, Logger
|
|
2
|
+
|
|
3
|
+
logger = Logger("example_openai.log")
|
|
4
|
+
|
|
5
|
+
# Define scraper configuration
|
|
6
|
+
config = JobScraperConfig(
|
|
7
|
+
position="Data Analyst",
|
|
8
|
+
location="San Francisco",
|
|
9
|
+
remote="REMOTE",
|
|
10
|
+
openai_enabled=True,
|
|
11
|
+
openai_model="gpt-4o-mini",
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Initialize the scraper
|
|
15
|
+
scraper = LinkedInJobScraper(logger, config)
|
|
16
|
+
|
|
17
|
+
# Scrape job data
|
|
18
|
+
job_data = scraper.run()
|
|
19
|
+
|
|
20
|
+
# View the results
|
|
21
|
+
print(job_data.head())
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "LinkedInWebScraper"
|
|
7
|
+
version = "1.1.1"
|
|
8
|
+
description = "A library for scraping LinkedIn job postings."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Ricardo Garcia Ramirez", email = "rgr.5882@gmail.com" },
|
|
15
|
+
]
|
|
16
|
+
keywords = ["linkedin", "scraper", "jobs", "openai", "pandas", "sqlite"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 3 - Alpha",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Programming Language :: Python :: 3.14",
|
|
25
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
26
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"beautifulsoup4>=4.12.3",
|
|
30
|
+
"numpy>=1.26.4",
|
|
31
|
+
"pandas>=2.2.0",
|
|
32
|
+
"requests>=2.31.0",
|
|
33
|
+
"SQLAlchemy>=2.0.36",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
openai = [
|
|
38
|
+
"openai>=2.29.0",
|
|
39
|
+
"pydantic>=2.7.0",
|
|
40
|
+
]
|
|
41
|
+
dev = [
|
|
42
|
+
"build>=1.2.2",
|
|
43
|
+
"coverage[toml]>=7.6.0",
|
|
44
|
+
"mkdocs>=1.6.1",
|
|
45
|
+
"mkdocs-material>=9.5.34",
|
|
46
|
+
"mkdocstrings[python]>=0.26.1",
|
|
47
|
+
"openai>=2.29.0",
|
|
48
|
+
"pydantic>=2.7.0",
|
|
49
|
+
"pytest>=8.3.2",
|
|
50
|
+
"pytest-cov>=5.0.0",
|
|
51
|
+
"pyrefly>=0.26.0",
|
|
52
|
+
"ruff>=0.6.9",
|
|
53
|
+
"tox>=4.21.2",
|
|
54
|
+
]
|
|
55
|
+
docs = [
|
|
56
|
+
"mkdocs>=1.6.1",
|
|
57
|
+
"mkdocs-material>=9.5.34",
|
|
58
|
+
"mkdocstrings[python]>=0.26.1",
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
[project.urls]
|
|
62
|
+
Homepage = "https://github.com/ricardogr07/LinkedInWebScraper"
|
|
63
|
+
Source = "https://github.com/ricardogr07/LinkedInWebScraper"
|
|
64
|
+
|
|
65
|
+
[project.scripts]
|
|
66
|
+
linkedin-webscraper = "linkedin_web_scraper.interfaces.cli.main:main"
|
|
67
|
+
|
|
68
|
+
[tool.setuptools]
|
|
69
|
+
include-package-data = true
|
|
70
|
+
package-dir = { "" = "src" }
|
|
71
|
+
|
|
72
|
+
[tool.setuptools.packages.find]
|
|
73
|
+
where = ["src"]
|
|
74
|
+
include = ["linkedin_web_scraper*"]
|
|
75
|
+
|
|
76
|
+
[tool.pytest.ini_options]
|
|
77
|
+
addopts = "-ra -p no:cacheprovider"
|
|
78
|
+
testpaths = ["tests"]
|
|
79
|
+
markers = [
|
|
80
|
+
"live_linkedin: tests that hit LinkedIn or related live HTTP endpoints",
|
|
81
|
+
"live_openai: tests that hit the OpenAI API",
|
|
82
|
+
]
|
|
83
|
+
|
|
84
|
+
[tool.ruff]
|
|
85
|
+
line-length = 100
|
|
86
|
+
target-version = "py311"
|
|
87
|
+
src = ["src", "."]
|
|
88
|
+
extend-exclude = ["site", ".tmp", "pytest-cache-files-*"]
|
|
89
|
+
|
|
90
|
+
[tool.ruff.format]
|
|
91
|
+
quote-style = "double"
|
|
92
|
+
indent-style = "space"
|
|
93
|
+
|
|
94
|
+
[tool.ruff.lint]
|
|
95
|
+
select = ["E4", "E7", "E9", "F", "I", "UP", "B", "SIM"]
|
|
96
|
+
|
|
97
|
+
[tool.coverage.run]
|
|
98
|
+
branch = true
|
|
99
|
+
source = ["src"]
|
|
100
|
+
|
|
101
|
+
[tool.coverage.report]
|
|
102
|
+
show_missing = true
|
|
103
|
+
skip_covered = false
|
|
104
|
+
|
|
105
|
+
[tool.pyrefly]
|
|
106
|
+
project-includes = [
|
|
107
|
+
"src/linkedin_web_scraper/config/job_scraper_config.py",
|
|
108
|
+
"src/linkedin_web_scraper/config/job_scraper_advanced_config.py",
|
|
109
|
+
"src/linkedin_web_scraper/config/job_scraper_config_factory.py",
|
|
110
|
+
"src/linkedin_web_scraper/config/openai.py",
|
|
111
|
+
"src/linkedin_web_scraper/config/storage.py",
|
|
112
|
+
"src/linkedin_web_scraper/config/options.py",
|
|
113
|
+
"src/linkedin_web_scraper/config/runtime.py",
|
|
114
|
+
"src/linkedin_web_scraper/application/daily_scrape_service.py",
|
|
115
|
+
"src/linkedin_web_scraper/application/linkedin_job_scraper.py",
|
|
116
|
+
"src/linkedin_web_scraper/application/storage.py",
|
|
117
|
+
"src/linkedin_web_scraper/application/runtime_runner.py",
|
|
118
|
+
"src/linkedin_web_scraper/application/release_manager.py",
|
|
119
|
+
"src/linkedin_web_scraper/domain/job_data_cleaner.py",
|
|
120
|
+
"src/linkedin_web_scraper/domain/job_title_classifier.py",
|
|
121
|
+
"src/linkedin_web_scraper/infra/logging.py",
|
|
122
|
+
"src/linkedin_web_scraper/infra/paths.py",
|
|
123
|
+
"src/linkedin_web_scraper/infra/http/policy.py",
|
|
124
|
+
"src/linkedin_web_scraper/infra/http/utils.py",
|
|
125
|
+
"src/linkedin_web_scraper/infra/http/job_scraper.py",
|
|
126
|
+
"src/linkedin_web_scraper/infra/openai/models.py",
|
|
127
|
+
"src/linkedin_web_scraper/infra/openai/openai_handler.py",
|
|
128
|
+
"src/linkedin_web_scraper/infra/openai/job_description_processor.py",
|
|
129
|
+
"src/linkedin_web_scraper/infra/storage/models.py",
|
|
130
|
+
"src/linkedin_web_scraper/infra/storage/sqlite.py",
|
|
131
|
+
]
|
|
132
|
+
search-path = ["src"]
|
|
133
|
+
python-version = "3.11"
|
|
134
|
+
use-ignore-files = true
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Example runtime configuration for CLI, containers, and scheduled runs.
|
|
2
|
+
# Keep OPENAI_API_KEY in the environment, never in this file.
|
|
3
|
+
|
|
4
|
+
[logging]
|
|
5
|
+
level = "INFO"
|
|
6
|
+
file_name = "main.log"
|
|
7
|
+
|
|
8
|
+
[storage]
|
|
9
|
+
file_name = "linkedin_jobs.sqlite"
|
|
10
|
+
state_dir = "artifacts/state"
|
|
11
|
+
|
|
12
|
+
[scrape.once]
|
|
13
|
+
position = "Data Scientist"
|
|
14
|
+
location = "Monterrey"
|
|
15
|
+
openai_enabled = false
|
|
16
|
+
openai_model = "gpt-4o-mini"
|
|
17
|
+
time_posted = "DAY"
|
|
18
|
+
remote_types = ["REMOTE", "HYBRID", "ON-SITE"]
|
|
19
|
+
file_name = "LinkedIn_Jobs_Data_Scientist_Monterrey.csv"
|
|
20
|
+
output_dir = "artifacts/jobs"
|
|
21
|
+
append = true
|
|
22
|
+
|
|
23
|
+
[scrape.daily]
|
|
24
|
+
cities = ["Monterrey", "Guadalajara", "Mexico City"]
|
|
25
|
+
position = "Data Scientist"
|
|
26
|
+
openai_enabled = false
|
|
27
|
+
openai_model = "gpt-4o-mini"
|
|
28
|
+
time_posted = "DAY"
|
|
29
|
+
output_dir = "artifacts/jobs"
|
|
30
|
+
combined_file_name = "LinkedIn_Jobs_Data_Scientist_Mexico.csv"
|
|
31
|
+
|
|
32
|
+
[export]
|
|
33
|
+
run_id = ""
|
|
34
|
+
file_name = "linkedin_jobs_export.csv"
|
|
35
|
+
output_dir = "artifacts/jobs"
|