vacancies-parser-kit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vacancies_parser_kit-0.1.0/.gitignore +10 -0
- vacancies_parser_kit-0.1.0/CLAUDE.md +88 -0
- vacancies_parser_kit-0.1.0/PKG-INFO +28 -0
- vacancies_parser_kit-0.1.0/pyproject.toml +43 -0
- vacancies_parser_kit-0.1.0/setup.cfg +4 -0
- vacancies_parser_kit-0.1.0/tests/__init__.py +0 -0
- vacancies_parser_kit-0.1.0/tests/test_headhunter.py +121 -0
- vacancies_parser_kit-0.1.0/tests/test_selector.py +52 -0
- vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/PKG-INFO +28 -0
- vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/SOURCES.txt +43 -0
- vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/dependency_links.txt +1 -0
- vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/entry_points.txt +2 -0
- vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/requires.txt +29 -0
- vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/top_level.txt +1 -0
- vacancies_parser_kit-0.1.0/vpr/__init__.py +3 -0
- vacancies_parser_kit-0.1.0/vpr/cli/__init__.py +0 -0
- vacancies_parser_kit-0.1.0/vpr/cli/commands/__init__.py +0 -0
- vacancies_parser_kit-0.1.0/vpr/cli/commands/init.py +29 -0
- vacancies_parser_kit-0.1.0/vpr/cli/commands/list.py +59 -0
- vacancies_parser_kit-0.1.0/vpr/cli/commands/run.py +92 -0
- vacancies_parser_kit-0.1.0/vpr/cli/main.py +19 -0
- vacancies_parser_kit-0.1.0/vpr/config/__init__.py +0 -0
- vacancies_parser_kit-0.1.0/vpr/config/profile.py +50 -0
- vacancies_parser_kit-0.1.0/vpr/config/project.py +41 -0
- vacancies_parser_kit-0.1.0/vpr/config/source_config.py +46 -0
- vacancies_parser_kit-0.1.0/vpr/core/__init__.py +0 -0
- vacancies_parser_kit-0.1.0/vpr/core/models.py +66 -0
- vacancies_parser_kit-0.1.0/vpr/core/pipeline.py +59 -0
- vacancies_parser_kit-0.1.0/vpr/core/selector.py +35 -0
- vacancies_parser_kit-0.1.0/vpr/sources/__init__.py +3 -0
- vacancies_parser_kit-0.1.0/vpr/sources/base.py +28 -0
- vacancies_parser_kit-0.1.0/vpr/sources/headhunter.py +254 -0
- vacancies_parser_kit-0.1.0/vpr/sources/hh_dictionaries.py +82 -0
- vacancies_parser_kit-0.1.0/vpr/sources/registry.py +27 -0
- vacancies_parser_kit-0.1.0/vpr/templates/__init__.py +3 -0
- vacancies_parser_kit-0.1.0/vpr/templates/scaffold.py +99 -0
- vacancies_parser_kit-0.1.0/vpr/transforms/__init__.py +0 -0
- vacancies_parser_kit-0.1.0/vpr/transforms/normalize.py +18 -0
- vacancies_parser_kit-0.1.0/vpr/writers/__init__.py +15 -0
- vacancies_parser_kit-0.1.0/vpr/writers/base.py +25 -0
- vacancies_parser_kit-0.1.0/vpr/writers/clickhouse.py +108 -0
- vacancies_parser_kit-0.1.0/vpr/writers/jsonl.py +28 -0
- vacancies_parser_kit-0.1.0/vpr/writers/postgres.py +132 -0
- vacancies_parser_kit-0.1.0/vpr/writers/registry.py +27 -0
- vacancies_parser_kit-0.1.0/vpr/writers/sqlite.py +114 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
Multi-source vacancy/job-listing parser. Collects vacancies from heterogeneous sources (Telegram channels, HeadHunter API, LinkedIn, and other job platforms), normalises them into a unified schema, and writes them to pluggable storage backends.
|
|
8
|
+
|
|
9
|
+
CLI tool: **`vpr`** (vacancy parser). Design follows dbt/detectkit patterns — YAML-based source definitions, profiles, tag-based selection.
|
|
10
|
+
|
|
11
|
+
## Key Design Decisions
|
|
12
|
+
|
|
13
|
+
- **Source-agnostic ingestion**: each source is a plugin that yields raw vacancy data. Sources may return unstructured or incomplete data — that's expected and must be handled gracefully (missing fields are nullable, never silently dropped).
|
|
14
|
+
- **Source instances in YAML**: each source is defined as a `.yml` file in `sources/`. Same source type (e.g. `headhunter`) can have multiple instances with different params.
|
|
15
|
+
- **Unified vacancy model**: all sources map to one canonical `Vacancy` record (`vpr/core/models.py`). Composite identity: `(source, source_id)`. Records support CRUD lifecycle states.
|
|
16
|
+
- **Per-source dictionaries**: sources may ship lookup/mapping dictionaries (e.g. HeadHunter area codes → city names) used during normalisation.
|
|
17
|
+
- **Pluggable storage**: writers are independent of sources. Targets defined in `profiles.yml`. Primary: ClickHouse, PostgreSQL.
|
|
18
|
+
- **No built-in scheduler**: execution via bash/cron/Prefect. CLI supports `--select` and tags for orchestration.
|
|
19
|
+
|
|
20
|
+
## Architecture
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
vpr/
|
|
24
|
+
├── cli/ – Click CLI: vpr init / run / list
|
|
25
|
+
│ └── commands/ – one module per command
|
|
26
|
+
├── config/ – YAML config loaders (project, profile, source)
|
|
27
|
+
├── core/
|
|
28
|
+
│ ├── models.py – canonical Vacancy schema
|
|
29
|
+
│ ├── selector.py – --select / --exclude resolution (name, glob, tag:)
|
|
30
|
+
│ └── pipeline.py – fetch → transform → write orchestration
|
|
31
|
+
├── sources/
|
|
32
|
+
│ ├── base.py – BaseSource ABC (async fetch → Vacancy iterator)
|
|
33
|
+
│ └── registry.py – @register_source decorator + lookup
|
|
34
|
+
├── transforms/
|
|
35
|
+
│ └── normalize.py – raw → canonical normalisation
|
|
36
|
+
├── writers/
|
|
37
|
+
│ ├── base.py – BaseWriter ABC (async write)
|
|
38
|
+
│ └── registry.py – @register_writer decorator + lookup
|
|
39
|
+
└── templates/ – scaffold for `vpr init`
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Development
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
python3 -m venv .venv && source .venv/bin/activate
|
|
46
|
+
pip install -e ".[dev]" # editable install with dev deps
|
|
47
|
+
pip install -e ".[all]" # all source/writer extras
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pytest # run all tests
|
|
52
|
+
pytest tests/test_selector.py # single test file
|
|
53
|
+
pytest -k "test_tag" # by test name pattern
|
|
54
|
+
ruff check vpr/ # lint
|
|
55
|
+
ruff format vpr/ # format
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## CLI Usage
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
vpr init <project-name> # scaffold a new project
|
|
62
|
+
vpr run --select <name|glob|tag:X> # run pipeline
|
|
63
|
+
vpr run --select tag:hh --exclude hh_test # with exclusion
|
|
64
|
+
vpr run --select "tg_*" --steps fetch # only fetch step
|
|
65
|
+
vpr list # table of all sources
|
|
66
|
+
vpr list --tags # grouped by tag
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Adding a New Source
|
|
70
|
+
|
|
71
|
+
1. Create `vpr/sources/<type>.py`, subclass `BaseSource`, set `source_type = "<type>"`
|
|
72
|
+
2. Decorate with `@register_source`
|
|
73
|
+
3. Implement `async def fetch(self) -> AsyncIterator[Vacancy]`
|
|
74
|
+
4. Import the module in `vpr/sources/__init__.py` so the decorator fires
|
|
75
|
+
|
|
76
|
+
## Adding a New Writer
|
|
77
|
+
|
|
78
|
+
1. Create `vpr/writers/<type>.py`, subclass `BaseWriter`, set `writer_type = "<type>"`
|
|
79
|
+
2. Decorate with `@register_writer`
|
|
80
|
+
3. Implement `async def write(self, vacancies: list[Vacancy]) -> int`
|
|
81
|
+
4. Import the module in `vpr/writers/__init__.py`
|
|
82
|
+
|
|
83
|
+
## Conventions
|
|
84
|
+
|
|
85
|
+
- All user-facing text and code comments in Russian are acceptable; code identifiers in English.
|
|
86
|
+
- Configuration via environment variables / `.env` files (never committed). Use `{{ env_var('NAME') }}` in profiles.yml.
|
|
87
|
+
- Each source and writer must be independently testable with a fixture/mock for the external service.
|
|
88
|
+
- Pipeline steps: `fetch`, `transform`, `write` — each can run independently via `--steps`.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vacancies-parser-kit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-source vacancy parser with pluggable storage backends
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: click>=8.1
|
|
8
|
+
Requires-Dist: pyyaml>=6.0
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: jinja2>=3.1
|
|
11
|
+
Requires-Dist: rich>=13.0
|
|
12
|
+
Provides-Extra: hh
|
|
13
|
+
Requires-Dist: httpx>=0.27; extra == "hh"
|
|
14
|
+
Provides-Extra: telegram
|
|
15
|
+
Requires-Dist: telethon>=1.36; extra == "telegram"
|
|
16
|
+
Provides-Extra: linkedin
|
|
17
|
+
Requires-Dist: httpx>=0.27; extra == "linkedin"
|
|
18
|
+
Provides-Extra: clickhouse
|
|
19
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
20
|
+
Provides-Extra: postgres
|
|
21
|
+
Requires-Dist: asyncpg>=0.29; extra == "postgres"
|
|
22
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == "postgres"
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: vacancies-parser-kit[clickhouse,hh,linkedin,postgres,telegram]; extra == "all"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "setuptools-scm>=8.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vacancies-parser-kit"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Multi-source vacancy parser with pluggable storage backends"
|
|
9
|
+
license = {text = "MIT"}
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"click>=8.1",
|
|
13
|
+
"pyyaml>=6.0",
|
|
14
|
+
"pydantic>=2.0",
|
|
15
|
+
"jinja2>=3.1",
|
|
16
|
+
"rich>=13.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.optional-dependencies]
|
|
20
|
+
hh = ["httpx>=0.27"]
|
|
21
|
+
telegram = ["telethon>=1.36"]
|
|
22
|
+
linkedin = ["httpx>=0.27"]
|
|
23
|
+
clickhouse = ["clickhouse-connect>=0.7"]
|
|
24
|
+
postgres = ["asyncpg>=0.29", "psycopg[binary]>=3.1"]
|
|
25
|
+
all = ["vpr[hh,telegram,linkedin,clickhouse,postgres]"]
|
|
26
|
+
dev = ["pytest>=8.0", "pytest-asyncio>=0.23", "ruff>=0.4"]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
vpr = "vpr.cli.main:cli"
|
|
30
|
+
|
|
31
|
+
[tool.setuptools.packages.find]
|
|
32
|
+
include = ["vpr*"]
|
|
33
|
+
|
|
34
|
+
[tool.ruff]
|
|
35
|
+
target-version = "py311"
|
|
36
|
+
line-length = 120
|
|
37
|
+
|
|
38
|
+
[tool.ruff.lint]
|
|
39
|
+
select = ["E", "F", "I", "UP"]
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
asyncio_mode = "auto"
|
|
43
|
+
testpaths = ["tests"]
|
|
File without changes
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Tests for HeadHunter source (mocked HTTP)."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from unittest.mock import AsyncMock, patch
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from vpr.config.source_config import SourceConfig
|
|
9
|
+
from vpr.sources.headhunter import HeadHunterSource
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _make_config(**params) -> SourceConfig:
    """Wrap raw keyword params in a throwaway headhunter SourceConfig."""
    return SourceConfig(
        name="test_hh",
        type="headhunter",
        params=params,
    )
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _vacancy_json(vid: str, name: str = "Python Dev") -> dict:
|
|
17
|
+
return {
|
|
18
|
+
"id": vid,
|
|
19
|
+
"name": name,
|
|
20
|
+
"employer": {"name": "TestCorp"},
|
|
21
|
+
"area": {"id": "1", "name": "Москва"},
|
|
22
|
+
"salary": {"from": 100000, "to": 200000, "currency": "RUR", "gross": False},
|
|
23
|
+
"experience": {"id": "between1And3", "name": "1–3 года"},
|
|
24
|
+
"employment": {"id": "full", "name": "Полная занятость"},
|
|
25
|
+
"schedule": {"id": "remote", "name": "Удалённая работа"},
|
|
26
|
+
"published_at": "2026-04-05T10:00:00+0300",
|
|
27
|
+
"alternate_url": "https://hh.ru/vacancy/12345",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _full_vacancy_json(vid: str) -> dict:
    """Build a full vacancy record: the brief one plus enrichment-only fields."""
    return {
        **_vacancy_json(vid),
        "description": "<p>We need a Python developer</p>",
        "key_skills": [{"name": "Python"}, {"name": "FastAPI"}],
    }
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _search_response(items: list[dict], found: int | None = None) -> dict:
|
|
39
|
+
return {
|
|
40
|
+
"found": found or len(items),
|
|
41
|
+
"pages": 1,
|
|
42
|
+
"per_page": 100,
|
|
43
|
+
"page": 0,
|
|
44
|
+
"items": items,
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class FakeResponse:
    """Minimal stand-in for an HTTP response carrying a JSON payload."""

    def __init__(self, data: dict, status_code: int = 200):
        serialized = json.dumps(data)
        self._data = data
        self.status_code = status_code
        # Mirror the text/content attributes of a real response object.
        self.text = serialized
        self.content = serialized.encode()

    def json(self):
        """Return the decoded payload, like ``Response.json()``."""
        return self._data
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@pytest.mark.asyncio
async def test_fetch_with_enrich():
    """Two-pass: search + enrich each vacancy."""
    # enrich=True makes the source issue one extra GET per vacancy for the
    # full record; request_delay=0 keeps the test fast.
    config = _make_config(search_text="python", enrich=True, request_delay=0)
    source = HeadHunterSource(config)

    # Response 1: search page with two brief vacancies.
    # Responses 2-3: the full records fetched during enrichment.
    search_resp = FakeResponse(_search_response([_vacancy_json("1"), _vacancy_json("2")]))
    full_1 = FakeResponse(_full_vacancy_json("1"))
    full_2 = FakeResponse(_full_vacancy_json("2"))

    # The source uses its client as an async context manager, so
    # __aenter__/__aexit__ must be mocked alongside .get; side_effect
    # returns the three responses in call order.
    mock_client = AsyncMock()
    mock_client.get = AsyncMock(side_effect=[search_resp, full_1, full_2])
    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
    mock_client.__aexit__ = AsyncMock(return_value=False)

    with patch.object(source, "_make_client", return_value=mock_client):
        vacancies = [v async for v in source.fetch()]

    assert len(vacancies) == 2
    assert vacancies[0].source == "headhunter"
    assert vacancies[0].source_id == "1"
    # description/skills only exist in the full record, so their presence
    # proves the enrichment pass actually ran.
    assert vacancies[0].description == "<p>We need a Python developer</p>"
    assert vacancies[0].skills == ["Python", "FastAPI"]
    assert vacancies[0].salary.min == 100000
    assert vacancies[0].salary.currency == "RUR"
    assert vacancies[0].city == "Москва"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@pytest.mark.asyncio
async def test_fetch_without_enrich():
    """Single pass: search only, no description/skills."""
    config = _make_config(search_text="python", enrich=False, request_delay=0)
    source = HeadHunterSource(config)

    search_resp = FakeResponse(_search_response([_vacancy_json("1")]))

    # Only the search endpoint should be hit; return_value (rather than
    # side_effect) means any number of GETs would get the same page.
    mock_client = AsyncMock()
    mock_client.get = AsyncMock(return_value=search_resp)
    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
    mock_client.__aexit__ = AsyncMock(return_value=False)

    with patch.object(source, "_make_client", return_value=mock_client):
        vacancies = [v async for v in source.fetch()]

    assert len(vacancies) == 1
    # Enrichment-only fields must stay absent when enrich=False.
    assert vacancies[0].description is None
    assert vacancies[0].skills == []
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@pytest.mark.asyncio
async def test_vacancy_mapping():
    """Check all canonical fields are correctly mapped."""
    # _to_vacancy is a pure mapping from the raw HH payload to the
    # canonical Vacancy model, so no HTTP mocking is needed here.
    v = HeadHunterSource._to_vacancy(_full_vacancy_json("42"))
    assert v.source == "headhunter"
    assert v.source_id == "42"
    assert v.title == "Python Dev"
    assert v.company == "TestCorp"
    assert v.experience == "1–3 года"
    assert v.employment_type == "Полная занятость"
    assert v.schedule == "Удалённая работа"
    assert v.url == "https://hh.ru/vacancy/12345"
    assert "Python" in v.skills
    # The original raw payload must be preserved verbatim on the model.
    assert v.raw["id"] == "42"
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Tests for source selector logic."""
|
|
2
|
+
|
|
3
|
+
from vpr.config.source_config import SourceConfig
|
|
4
|
+
from vpr.core.selector import select_sources
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _make(name: str, tags: list[str] | None = None) -> SourceConfig:
    """Build a minimal SourceConfig fixture with optional tags."""
    resolved_tags = [] if tags is None else tags
    return SourceConfig(name=name, type="test", tags=resolved_tags)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Fixture set covering every selector dimension: shared name prefixes for
# glob matching (hh_*, tg_*) and overlapping tags for tag: selection.
SOURCES = [
    _make("hh_backend", ["hh", "backend"]),
    _make("hh_frontend", ["hh", "frontend"]),
    _make("tg_python_jobs", ["telegram", "python"]),
    _make("tg_devops_jobs", ["telegram", "devops"]),
    _make("linkedin_eu", ["linkedin"]),
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_select_by_exact_name():
    """An exact name selects exactly that one source."""
    selected = select_sources(SOURCES, "hh_backend")
    assert [src.name for src in selected] == ["hh_backend"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_select_by_glob():
    """A glob pattern matches every source whose name fits it."""
    names = sorted(src.name for src in select_sources(SOURCES, "hh_*"))
    assert names == ["hh_backend", "hh_frontend"]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_select_by_tag():
    """tag:<tag> selects every source carrying that tag."""
    names = sorted(src.name for src in select_sources(SOURCES, "tag:telegram"))
    assert names == ["tg_devops_jobs", "tg_python_jobs"]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_select_all_glob():
    """A bare '*' matches the full source list."""
    selected = select_sources(SOURCES, "*")
    assert len(selected) == 5
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_exclude():
    """Excluded names are removed from the selector's matches."""
    selected = select_sources(SOURCES, "tag:hh", excludes={"hh_frontend"})
    assert [src.name for src in selected] == ["hh_backend"]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_no_match():
    """A selector that matches nothing yields an empty list, not an error."""
    assert select_sources(SOURCES, "nonexistent") == []
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_exclude_all():
    """Excluding every match leaves an empty result."""
    selected = select_sources(SOURCES, "tag:hh", excludes={"hh_backend", "hh_frontend"})
    assert selected == []
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vacancies-parser-kit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-source vacancy parser with pluggable storage backends
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: click>=8.1
|
|
8
|
+
Requires-Dist: pyyaml>=6.0
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: jinja2>=3.1
|
|
11
|
+
Requires-Dist: rich>=13.0
|
|
12
|
+
Provides-Extra: hh
|
|
13
|
+
Requires-Dist: httpx>=0.27; extra == "hh"
|
|
14
|
+
Provides-Extra: telegram
|
|
15
|
+
Requires-Dist: telethon>=1.36; extra == "telegram"
|
|
16
|
+
Provides-Extra: linkedin
|
|
17
|
+
Requires-Dist: httpx>=0.27; extra == "linkedin"
|
|
18
|
+
Provides-Extra: clickhouse
|
|
19
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
20
|
+
Provides-Extra: postgres
|
|
21
|
+
Requires-Dist: asyncpg>=0.29; extra == "postgres"
|
|
22
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == "postgres"
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: vacancies-parser-kit[clickhouse,hh,linkedin,postgres,telegram]; extra == "all"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
.gitignore
|
|
2
|
+
CLAUDE.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
tests/__init__.py
|
|
5
|
+
tests/test_headhunter.py
|
|
6
|
+
tests/test_selector.py
|
|
7
|
+
vacancies_parser_kit.egg-info/PKG-INFO
|
|
8
|
+
vacancies_parser_kit.egg-info/SOURCES.txt
|
|
9
|
+
vacancies_parser_kit.egg-info/dependency_links.txt
|
|
10
|
+
vacancies_parser_kit.egg-info/entry_points.txt
|
|
11
|
+
vacancies_parser_kit.egg-info/requires.txt
|
|
12
|
+
vacancies_parser_kit.egg-info/top_level.txt
|
|
13
|
+
vpr/__init__.py
|
|
14
|
+
vpr/cli/__init__.py
|
|
15
|
+
vpr/cli/main.py
|
|
16
|
+
vpr/cli/commands/__init__.py
|
|
17
|
+
vpr/cli/commands/init.py
|
|
18
|
+
vpr/cli/commands/list.py
|
|
19
|
+
vpr/cli/commands/run.py
|
|
20
|
+
vpr/config/__init__.py
|
|
21
|
+
vpr/config/profile.py
|
|
22
|
+
vpr/config/project.py
|
|
23
|
+
vpr/config/source_config.py
|
|
24
|
+
vpr/core/__init__.py
|
|
25
|
+
vpr/core/models.py
|
|
26
|
+
vpr/core/pipeline.py
|
|
27
|
+
vpr/core/selector.py
|
|
28
|
+
vpr/sources/__init__.py
|
|
29
|
+
vpr/sources/base.py
|
|
30
|
+
vpr/sources/headhunter.py
|
|
31
|
+
vpr/sources/hh_dictionaries.py
|
|
32
|
+
vpr/sources/registry.py
|
|
33
|
+
vpr/templates/__init__.py
|
|
34
|
+
vpr/templates/scaffold.py
|
|
35
|
+
vpr/transforms/__init__.py
|
|
36
|
+
vpr/transforms/normalize.py
|
|
37
|
+
vpr/writers/__init__.py
|
|
38
|
+
vpr/writers/base.py
|
|
39
|
+
vpr/writers/clickhouse.py
|
|
40
|
+
vpr/writers/jsonl.py
|
|
41
|
+
vpr/writers/postgres.py
|
|
42
|
+
vpr/writers/registry.py
|
|
43
|
+
vpr/writers/sqlite.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
click>=8.1
|
|
2
|
+
pyyaml>=6.0
|
|
3
|
+
pydantic>=2.0
|
|
4
|
+
jinja2>=3.1
|
|
5
|
+
rich>=13.0
|
|
6
|
+
|
|
7
|
+
[all]
|
|
8
|
+
vacancies-parser-kit[clickhouse,hh,linkedin,postgres,telegram]
|
|
9
|
+
|
|
10
|
+
[clickhouse]
|
|
11
|
+
clickhouse-connect>=0.7
|
|
12
|
+
|
|
13
|
+
[dev]
|
|
14
|
+
pytest>=8.0
|
|
15
|
+
pytest-asyncio>=0.23
|
|
16
|
+
ruff>=0.4
|
|
17
|
+
|
|
18
|
+
[hh]
|
|
19
|
+
httpx>=0.27
|
|
20
|
+
|
|
21
|
+
[linkedin]
|
|
22
|
+
httpx>=0.27
|
|
23
|
+
|
|
24
|
+
[postgres]
|
|
25
|
+
asyncpg>=0.29
|
|
26
|
+
psycopg[binary]>=3.1
|
|
27
|
+
|
|
28
|
+
[telegram]
|
|
29
|
+
telethon>=1.36
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
vpr
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""vpr init — scaffold a new project."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
|
|
10
|
+
from vpr.templates import render_project_scaffold
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.command("init")
|
|
16
|
+
@click.argument("project_name")
|
|
17
|
+
def init_cmd(project_name: str) -> None:
|
|
18
|
+
"""Create a new vpr project."""
|
|
19
|
+
target = Path.cwd() / project_name
|
|
20
|
+
if target.exists():
|
|
21
|
+
raise click.ClickException(f"Directory '{project_name}' already exists")
|
|
22
|
+
|
|
23
|
+
render_project_scaffold(target, project_name)
|
|
24
|
+
console.print(f"[green]Project '{project_name}' created at {target}[/green]")
|
|
25
|
+
console.print("Next steps:")
|
|
26
|
+
console.print(f" cd {project_name}")
|
|
27
|
+
console.print(" # edit profiles.yml with your database credentials")
|
|
28
|
+
console.print(" # add source definitions to sources/")
|
|
29
|
+
console.print(" vpr run --select <source_name>")
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""vpr list — show all configured sources."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from vpr.config.project import find_project_root, load_project_config
|
|
10
|
+
from vpr.config.source_config import load_source_configs
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.command("list")
|
|
16
|
+
@click.option("--tags", is_flag=True, help="Group sources by tag")
|
|
17
|
+
def list_cmd(tags: bool) -> None:
|
|
18
|
+
"""List all configured sources."""
|
|
19
|
+
project_root = find_project_root()
|
|
20
|
+
project_cfg = load_project_config(project_root)
|
|
21
|
+
sources = load_source_configs(project_root / project_cfg.paths.sources)
|
|
22
|
+
|
|
23
|
+
if not sources:
|
|
24
|
+
console.print("[yellow]No sources found.[/yellow]")
|
|
25
|
+
return
|
|
26
|
+
|
|
27
|
+
if tags:
|
|
28
|
+
_print_by_tags(sources)
|
|
29
|
+
else:
|
|
30
|
+
_print_table(sources)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _print_table(sources) -> None:
    """Render the sources as a rich table, sorted by source name."""
    table = Table(title="Sources")
    table.add_column("Name", style="cyan")
    table.add_column("Type", style="green")
    table.add_column("Tags")
    table.add_column("Targets")

    for cfg in sorted(sources, key=lambda s: s.name):
        # Em dash marks a source with no explicit writer targets.
        targets_cell = ", ".join(cfg.targets) if cfg.targets else "—"
        table.add_row(cfg.name, cfg.type, ", ".join(cfg.tags), targets_cell)

    console.print(table)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _print_by_tags(sources) -> None:
    """Print source names grouped under each tag, both levels sorted."""
    grouped: dict[str, list[str]] = {}
    for cfg in sources:
        # A source appears once under every tag it carries.
        for tag in cfg.tags:
            grouped.setdefault(tag, []).append(cfg.name)

    for tag, names in sorted(grouped.items()):
        console.print(f"[bold]tag:{tag}[/bold]")
        for name in sorted(names):
            console.print(f" - {name}")
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""vpr run — fetch, transform, and write vacancies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
from vpr.config.project import find_project_root, load_project_config
|
|
12
|
+
from vpr.config.profile import load_profiles
|
|
13
|
+
from vpr.config.source_config import load_source_configs
|
|
14
|
+
from vpr.core.selector import select_sources
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
console = Console()
|
|
20
|
+
|
|
21
|
+
VALID_STEPS = ("fetch", "transform", "write")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@click.command("run")
|
|
25
|
+
@click.option("--select", "selector", required=True, help="Source selector: name, glob pattern, or tag:<tag>")
|
|
26
|
+
@click.option("--exclude", "excludes", multiple=True, help="Exclude sources by name")
|
|
27
|
+
@click.option("--steps", default=None, help=f"Comma-separated pipeline steps: {', '.join(VALID_STEPS)}")
|
|
28
|
+
@click.option("--profile", default=None, help="Override default profile from vpr_project.yml")
|
|
29
|
+
@click.option("--full-refresh", is_flag=True, help="Drop existing data and re-fetch from scratch")
|
|
30
|
+
def run_cmd(
|
|
31
|
+
selector: str,
|
|
32
|
+
excludes: tuple[str, ...],
|
|
33
|
+
steps: str | None,
|
|
34
|
+
profile: str | None,
|
|
35
|
+
full_refresh: bool,
|
|
36
|
+
) -> None:
|
|
37
|
+
"""Run the vacancy pipeline for selected sources."""
|
|
38
|
+
active_steps = _parse_steps(steps)
|
|
39
|
+
|
|
40
|
+
project_root = find_project_root()
|
|
41
|
+
project_cfg = load_project_config(project_root)
|
|
42
|
+
profiles_cfg = load_profiles(project_root)
|
|
43
|
+
all_sources = load_source_configs(project_root / project_cfg.paths.sources)
|
|
44
|
+
|
|
45
|
+
if not all_sources:
|
|
46
|
+
raise click.ClickException(f"No source configs found in {project_cfg.paths.sources}/")
|
|
47
|
+
|
|
48
|
+
matched = select_sources(all_sources, selector, excludes=set(excludes))
|
|
49
|
+
if not matched:
|
|
50
|
+
raise click.ClickException(f"No sources matched selector '{selector}'")
|
|
51
|
+
|
|
52
|
+
profile_name = profile or project_cfg.default_profile
|
|
53
|
+
if profile_name not in profiles_cfg.profiles:
|
|
54
|
+
raise click.ClickException(f"Profile '{profile_name}' not found in profiles.yml")
|
|
55
|
+
|
|
56
|
+
console.print(f"[bold]Profile:[/bold] {profile_name}")
|
|
57
|
+
console.print(f"[bold]Steps:[/bold] {', '.join(active_steps)}")
|
|
58
|
+
console.print(f"[bold]Sources ({len(matched)}):[/bold]")
|
|
59
|
+
for src in matched:
|
|
60
|
+
console.print(f" - {src.name} [dim]({src.type})[/dim]")
|
|
61
|
+
console.print()
|
|
62
|
+
|
|
63
|
+
asyncio.run(_run_pipeline(matched, profiles_cfg, profile_name, active_steps, full_refresh))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
async def _run_pipeline(sources, profiles_cfg, profile_name, steps, full_refresh):
    """Execute the pipeline for each matched source."""
    # NOTE(review): imported inside the function rather than at module
    # top — presumably to defer loading pipeline/source dependencies
    # until a run actually starts; confirm before moving it.
    from vpr.core.pipeline import run_source_pipeline

    # profile_name was validated against profiles_cfg.profiles in run_cmd.
    profile = profiles_cfg.profiles[profile_name]
    for src_cfg in sources:
        console.print(f"[bold cyan]>>> {src_cfg.name}[/bold cyan]")
        try:
            await run_source_pipeline(
                source_config=src_cfg,
                profile=profile,
                steps=steps,
                full_refresh=full_refresh,
            )
            console.print(f"[green] ✓ {src_cfg.name} done[/green]")
        except Exception as exc:
            # Broad catch is deliberate: one failing source must not
            # abort the remaining sources in the batch.
            console.print(f"[red] ✗ {src_cfg.name} failed: {exc}[/red]")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _parse_steps(raw: str | None) -> list[str]:
    """Parse the --steps option into an ordered list of pipeline steps.

    Args:
        raw: Comma-separated step names, or None to run every step.

    Returns:
        The requested steps in the order given; all of VALID_STEPS
        when raw is None.

    Raises:
        click.ClickException: for an unknown step name, or when no step
            names remain after dropping empty segments.
    """
    if raw is None:
        return list(VALID_STEPS)
    # Drop empty segments so inputs like "fetch," or "fetch, ,write" do
    # not fail with a confusing "Unknown step ''" message.
    parts = [s.strip() for s in raw.split(",") if s.strip()]
    if not parts:
        raise click.ClickException(f"No steps given. Valid: {', '.join(VALID_STEPS)}")
    for s in parts:
        if s not in VALID_STEPS:
            raise click.ClickException(f"Unknown step '{s}'. Valid: {', '.join(VALID_STEPS)}")
    return parts
|