vacancies-parser-kit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. vacancies_parser_kit-0.1.0/.gitignore +10 -0
  2. vacancies_parser_kit-0.1.0/CLAUDE.md +88 -0
  3. vacancies_parser_kit-0.1.0/PKG-INFO +28 -0
  4. vacancies_parser_kit-0.1.0/pyproject.toml +43 -0
  5. vacancies_parser_kit-0.1.0/setup.cfg +4 -0
  6. vacancies_parser_kit-0.1.0/tests/__init__.py +0 -0
  7. vacancies_parser_kit-0.1.0/tests/test_headhunter.py +121 -0
  8. vacancies_parser_kit-0.1.0/tests/test_selector.py +52 -0
  9. vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/PKG-INFO +28 -0
  10. vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/SOURCES.txt +43 -0
  11. vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/dependency_links.txt +1 -0
  12. vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/entry_points.txt +2 -0
  13. vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/requires.txt +29 -0
  14. vacancies_parser_kit-0.1.0/vacancies_parser_kit.egg-info/top_level.txt +1 -0
  15. vacancies_parser_kit-0.1.0/vpr/__init__.py +3 -0
  16. vacancies_parser_kit-0.1.0/vpr/cli/__init__.py +0 -0
  17. vacancies_parser_kit-0.1.0/vpr/cli/commands/__init__.py +0 -0
  18. vacancies_parser_kit-0.1.0/vpr/cli/commands/init.py +29 -0
  19. vacancies_parser_kit-0.1.0/vpr/cli/commands/list.py +59 -0
  20. vacancies_parser_kit-0.1.0/vpr/cli/commands/run.py +92 -0
  21. vacancies_parser_kit-0.1.0/vpr/cli/main.py +19 -0
  22. vacancies_parser_kit-0.1.0/vpr/config/__init__.py +0 -0
  23. vacancies_parser_kit-0.1.0/vpr/config/profile.py +50 -0
  24. vacancies_parser_kit-0.1.0/vpr/config/project.py +41 -0
  25. vacancies_parser_kit-0.1.0/vpr/config/source_config.py +46 -0
  26. vacancies_parser_kit-0.1.0/vpr/core/__init__.py +0 -0
  27. vacancies_parser_kit-0.1.0/vpr/core/models.py +66 -0
  28. vacancies_parser_kit-0.1.0/vpr/core/pipeline.py +59 -0
  29. vacancies_parser_kit-0.1.0/vpr/core/selector.py +35 -0
  30. vacancies_parser_kit-0.1.0/vpr/sources/__init__.py +3 -0
  31. vacancies_parser_kit-0.1.0/vpr/sources/base.py +28 -0
  32. vacancies_parser_kit-0.1.0/vpr/sources/headhunter.py +254 -0
  33. vacancies_parser_kit-0.1.0/vpr/sources/hh_dictionaries.py +82 -0
  34. vacancies_parser_kit-0.1.0/vpr/sources/registry.py +27 -0
  35. vacancies_parser_kit-0.1.0/vpr/templates/__init__.py +3 -0
  36. vacancies_parser_kit-0.1.0/vpr/templates/scaffold.py +99 -0
  37. vacancies_parser_kit-0.1.0/vpr/transforms/__init__.py +0 -0
  38. vacancies_parser_kit-0.1.0/vpr/transforms/normalize.py +18 -0
  39. vacancies_parser_kit-0.1.0/vpr/writers/__init__.py +15 -0
  40. vacancies_parser_kit-0.1.0/vpr/writers/base.py +25 -0
  41. vacancies_parser_kit-0.1.0/vpr/writers/clickhouse.py +108 -0
  42. vacancies_parser_kit-0.1.0/vpr/writers/jsonl.py +28 -0
  43. vacancies_parser_kit-0.1.0/vpr/writers/postgres.py +132 -0
  44. vacancies_parser_kit-0.1.0/vpr/writers/registry.py +27 -0
  45. vacancies_parser_kit-0.1.0/vpr/writers/sqlite.py +114 -0
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ .env
8
+ *.egg
9
+ .ruff_cache/
10
+ .pytest_cache/
@@ -0,0 +1,88 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ Multi-source vacancy/job-listing parser. Collects vacancies from heterogeneous sources (Telegram channels, HeadHunter API, LinkedIn, and other job platforms), normalises them into a unified schema, and writes them to pluggable storage backends.
8
+
9
+ CLI tool: **`vpr`** (vacancy parser). Design follows dbt/detectkit patterns — YAML-based source definitions, profiles, tag-based selection.
10
+
11
+ ## Key Design Decisions
12
+
13
+ - **Source-agnostic ingestion**: each source is a plugin that yields raw vacancy data. Sources may return unstructured or incomplete data — that's expected and must be handled gracefully (missing fields are nullable, never silently dropped).
14
+ - **Source instances in YAML**: each source is defined as a `.yml` file in `sources/`. Same source type (e.g. `headhunter`) can have multiple instances with different params.
15
+ - **Unified vacancy model**: all sources map to one canonical `Vacancy` record (`vpr/core/models.py`). Composite identity: `(source, source_id)`. Records support CRUD lifecycle states.
16
+ - **Per-source dictionaries**: sources may ship lookup/mapping dictionaries (e.g. HeadHunter area codes → city names) used during normalisation.
17
+ - **Pluggable storage**: writers are independent of sources. Targets defined in `profiles.yml`. Primary: ClickHouse, PostgreSQL.
18
+ - **No built-in scheduler**: execution via bash/cron/Prefect. CLI supports `--select` and tags for orchestration.
19
+
20
+ ## Architecture
21
+
22
+ ```
23
+ vpr/
24
+ ├── cli/ – Click CLI: vpr init / run / list
25
+ │ └── commands/ – one module per command
26
+ ├── config/ – YAML config loaders (project, profile, source)
27
+ ├── core/
28
+ │ ├── models.py – canonical Vacancy schema
29
+ │ ├── selector.py – --select / --exclude resolution (name, glob, tag:)
30
+ │ └── pipeline.py – fetch → transform → write orchestration
31
+ ├── sources/
32
+ │ ├── base.py – BaseSource ABC (async fetch → Vacancy iterator)
33
+ │ └── registry.py – @register_source decorator + lookup
34
+ ├── transforms/
35
+ │ └── normalize.py – raw → canonical normalisation
36
+ ├── writers/
37
+ │ ├── base.py – BaseWriter ABC (async write)
38
+ │ └── registry.py – @register_writer decorator + lookup
39
+ └── templates/ – scaffold for `vpr init`
40
+ ```
41
+
42
+ ## Development
43
+
44
+ ```bash
45
+ python3 -m venv .venv && source .venv/bin/activate
46
+ pip install -e ".[dev]" # editable install with dev deps
47
+ pip install -e ".[all]" # all source/writer extras
48
+ ```
49
+
50
+ ```bash
51
+ pytest # run all tests
52
+ pytest tests/test_selector.py # single test file
53
+ pytest -k "test_tag" # by test name pattern
54
+ ruff check vpr/ # lint
55
+ ruff format vpr/ # format
56
+ ```
57
+
58
+ ## CLI Usage
59
+
60
+ ```bash
61
+ vpr init <project-name> # scaffold a new project
62
+ vpr run --select <name|glob|tag:X> # run pipeline
63
+ vpr run --select tag:hh --exclude hh_test # with exclusion
64
+ vpr run --select "tg_*" --steps fetch # only fetch step
65
+ vpr list # table of all sources
66
+ vpr list --tags # grouped by tag
67
+ ```
68
+
69
+ ## Adding a New Source
70
+
71
+ 1. Create `vpr/sources/<type>.py`, subclass `BaseSource`, set `source_type = "<type>"`
72
+ 2. Decorate with `@register_source`
73
+ 3. Implement `async def fetch(self) -> AsyncIterator[Vacancy]`
74
+ 4. Import the module in `vpr/sources/__init__.py` so the decorator fires
75
+
76
+ ## Adding a New Writer
77
+
78
+ 1. Create `vpr/writers/<type>.py`, subclass `BaseWriter`, set `writer_type = "<type>"`
79
+ 2. Decorate with `@register_writer`
80
+ 3. Implement `async def write(self, vacancies: list[Vacancy]) -> int`
81
+ 4. Import the module in `vpr/writers/__init__.py`
82
+
83
+ ## Conventions
84
+
85
+ - User-facing text and code comments may be written in Russian; code identifiers must be in English.
86
+ - Configuration via environment variables / `.env` files (never committed). Use `{{ env_var('NAME') }}` in profiles.yml.
87
+ - Each source and writer must be independently testable with a fixture/mock for the external service.
88
+ - Pipeline steps: `fetch`, `transform`, `write` — each can run independently via `--steps`.
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: vacancies-parser-kit
3
+ Version: 0.1.0
4
+ Summary: Multi-source vacancy parser with pluggable storage backends
5
+ License: MIT
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: click>=8.1
8
+ Requires-Dist: pyyaml>=6.0
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: jinja2>=3.1
11
+ Requires-Dist: rich>=13.0
12
+ Provides-Extra: hh
13
+ Requires-Dist: httpx>=0.27; extra == "hh"
14
+ Provides-Extra: telegram
15
+ Requires-Dist: telethon>=1.36; extra == "telegram"
16
+ Provides-Extra: linkedin
17
+ Requires-Dist: httpx>=0.27; extra == "linkedin"
18
+ Provides-Extra: clickhouse
19
+ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
20
+ Provides-Extra: postgres
21
+ Requires-Dist: asyncpg>=0.29; extra == "postgres"
22
+ Requires-Dist: psycopg[binary]>=3.1; extra == "postgres"
23
+ Provides-Extra: all
24
+ Requires-Dist: vpr[clickhouse,hh,linkedin,postgres,telegram]; extra == "all"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0; extra == "dev"
27
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
28
+ Requires-Dist: ruff>=0.4; extra == "dev"
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "setuptools-scm>=8.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "vacancies-parser-kit"
7
+ version = "0.1.0"
8
+ description = "Multi-source vacancy parser with pluggable storage backends"
9
+ license = {text = "MIT"}
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "click>=8.1",
13
+ "pyyaml>=6.0",
14
+ "pydantic>=2.0",
15
+ "jinja2>=3.1",
16
+ "rich>=13.0",
17
+ ]
18
+
19
+ [project.optional-dependencies]
20
+ hh = ["httpx>=0.27"]
21
+ telegram = ["telethon>=1.36"]
22
+ linkedin = ["httpx>=0.27"]
23
+ clickhouse = ["clickhouse-connect>=0.7"]
24
+ postgres = ["asyncpg>=0.29", "psycopg[binary]>=3.1"]
25
+ all = ["vacancies-parser-kit[hh,telegram,linkedin,clickhouse,postgres]"]
26
+ dev = ["pytest>=8.0", "pytest-asyncio>=0.23", "ruff>=0.4"]
27
+
28
+ [project.scripts]
29
+ vpr = "vpr.cli.main:cli"
30
+
31
+ [tool.setuptools.packages.find]
32
+ include = ["vpr*"]
33
+
34
+ [tool.ruff]
35
+ target-version = "py311"
36
+ line-length = 120
37
+
38
+ [tool.ruff.lint]
39
+ select = ["E", "F", "I", "UP"]
40
+
41
+ [tool.pytest.ini_options]
42
+ asyncio_mode = "auto"
43
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,121 @@
1
+ """Tests for HeadHunter source (mocked HTTP)."""
2
+
3
+ import json
4
+ from unittest.mock import AsyncMock, patch
5
+
6
+ import pytest
7
+
8
+ from vpr.config.source_config import SourceConfig
9
+ from vpr.sources.headhunter import HeadHunterSource
10
+
11
+
12
def _make_config(**params) -> SourceConfig:
    """Build a ``headhunter`` source config named ``test_hh`` from keyword params."""
    config = SourceConfig(name="test_hh", type="headhunter", params=params)
    return config
14
+
15
+
16
+ def _vacancy_json(vid: str, name: str = "Python Dev") -> dict:
17
+ return {
18
+ "id": vid,
19
+ "name": name,
20
+ "employer": {"name": "TestCorp"},
21
+ "area": {"id": "1", "name": "Москва"},
22
+ "salary": {"from": 100000, "to": 200000, "currency": "RUR", "gross": False},
23
+ "experience": {"id": "between1And3", "name": "1–3 года"},
24
+ "employment": {"id": "full", "name": "Полная занятость"},
25
+ "schedule": {"id": "remote", "name": "Удалённая работа"},
26
+ "published_at": "2026-04-05T10:00:00+0300",
27
+ "alternate_url": "https://hh.ru/vacancy/12345",
28
+ }
29
+
30
+
31
def _full_vacancy_json(vid: str) -> dict:
    """Return the base vacancy payload extended with a description and key skills."""
    skills = [{"name": skill} for skill in ("Python", "FastAPI")]
    full = _vacancy_json(vid)
    full.update(description="<p>We need a Python developer</p>", key_skills=skills)
    return full
36
+
37
+
38
+ def _search_response(items: list[dict], found: int | None = None) -> dict:
39
+ return {
40
+ "found": found or len(items),
41
+ "pages": 1,
42
+ "per_page": 100,
43
+ "page": 0,
44
+ "items": items,
45
+ }
46
+
47
+
48
class FakeResponse:
    """Minimal stand-in for an HTTP response that carries a JSON payload."""

    def __init__(self, data: dict, status_code: int = 200):
        # Serialise once and derive both the text and the byte representation from it.
        serialized = json.dumps(data)
        self._data = data
        self.status_code = status_code
        self.text = serialized
        self.content = serialized.encode()

    def json(self):
        """Return the payload object that was passed to the constructor."""
        return self._data
57
+
58
+
59
@pytest.mark.asyncio
async def test_fetch_with_enrich():
    """With ``enrich=True`` the search result is followed by one request per vacancy."""
    source = HeadHunterSource(_make_config(search_text="python", enrich=True, request_delay=0))

    # The mocked ``get`` hands these out in order: the search page first,
    # then the full payloads for ids "1" and "2".
    responses = [FakeResponse(_search_response([_vacancy_json("1"), _vacancy_json("2")]))]
    responses += [FakeResponse(_full_vacancy_json(vid)) for vid in ("1", "2")]

    client = AsyncMock()
    client.get = AsyncMock(side_effect=responses)
    client.__aenter__ = AsyncMock(return_value=client)
    client.__aexit__ = AsyncMock(return_value=False)

    with patch.object(source, "_make_client", return_value=client):
        vacancies = [item async for item in source.fetch()]

    assert len(vacancies) == 2
    first = vacancies[0]
    assert first.source == "headhunter"
    assert first.source_id == "1"
    assert first.description == "<p>We need a Python developer</p>"
    assert first.skills == ["Python", "FastAPI"]
    assert first.salary.min == 100000
    assert first.salary.currency == "RUR"
    assert first.city == "Москва"
85
+
86
+
87
@pytest.mark.asyncio
async def test_fetch_without_enrich():
    """With ``enrich=False`` the vacancy has no description and an empty skills list."""
    source = HeadHunterSource(_make_config(search_text="python", enrich=False, request_delay=0))

    # Every ``get`` call returns the same one-item search page.
    page = FakeResponse(_search_response([_vacancy_json("1")]))

    client = AsyncMock()
    client.get = AsyncMock(return_value=page)
    client.__aenter__ = AsyncMock(return_value=client)
    client.__aexit__ = AsyncMock(return_value=False)

    with patch.object(source, "_make_client", return_value=client):
        vacancies = [item async for item in source.fetch()]

    assert len(vacancies) == 1
    only = vacancies[0]
    assert only.description is None
    assert only.skills == []
106
+
107
+
108
@pytest.mark.asyncio
async def test_vacancy_mapping():
    """Mapping a full payload fills the canonical fields checked below."""
    vacancy = HeadHunterSource._to_vacancy(_full_vacancy_json("42"))

    # Scalar fields that must equal a fixed value, checked in declaration order.
    expected = {
        "source": "headhunter",
        "source_id": "42",
        "title": "Python Dev",
        "company": "TestCorp",
        "experience": "1–3 года",
        "employment_type": "Полная занятость",
        "schedule": "Удалённая работа",
        "url": "https://hh.ru/vacancy/12345",
    }
    for field, value in expected.items():
        assert getattr(vacancy, field) == value

    assert "Python" in vacancy.skills
    assert vacancy.raw["id"] == "42"
@@ -0,0 +1,52 @@
1
+ """Tests for source selector logic."""
2
+
3
+ from vpr.config.source_config import SourceConfig
4
+ from vpr.core.selector import select_sources
5
+
6
+
7
def _make(name: str, tags: list[str] | None = None) -> SourceConfig:
    """Create a ``SourceConfig`` of type ``test``; a missing tag list becomes empty."""
    resolved_tags = tags or []
    return SourceConfig(name=name, type="test", tags=resolved_tags)
9
+
10
+
11
# Shared fixture for the selector tests: five source configs. Two are tagged
# "hh", two are tagged "telegram", and one is tagged "linkedin".
SOURCES = [
    _make("hh_backend", ["hh", "backend"]),
    _make("hh_frontend", ["hh", "frontend"]),
    _make("tg_python_jobs", ["telegram", "python"]),
    _make("tg_devops_jobs", ["telegram", "devops"]),
    _make("linkedin_eu", ["linkedin"]),
]
18
+
19
+
20
def test_select_by_exact_name():
    """A plain name selects exactly the source with that name."""
    names = [src.name for src in select_sources(SOURCES, "hh_backend")]
    assert names == ["hh_backend"]
23
+
24
+
25
def test_select_by_glob():
    """The pattern ``hh_*`` selects both sources whose names start with ``hh_``."""
    names = sorted(src.name for src in select_sources(SOURCES, "hh_*"))
    assert names == ["hh_backend", "hh_frontend"]
28
+
29
+
30
def test_select_by_tag():
    """``tag:telegram`` selects the two sources carrying the ``telegram`` tag."""
    names = sorted(src.name for src in select_sources(SOURCES, "tag:telegram"))
    assert names == ["tg_devops_jobs", "tg_python_jobs"]
33
+
34
+
35
def test_select_all_glob():
    """The bare ``*`` pattern selects all five fixture sources."""
    selected = select_sources(SOURCES, "*")
    assert len(selected) == 5
38
+
39
+
40
def test_exclude():
    """An excluded name is removed from an otherwise matching tag selection."""
    selected = select_sources(SOURCES, "tag:hh", excludes={"hh_frontend"})
    names = [src.name for src in selected]
    assert names == ["hh_backend"]
43
+
44
+
45
def test_no_match():
    """A selector that matches nothing yields an empty list."""
    selected = select_sources(SOURCES, "nonexistent")
    assert selected == []
48
+
49
+
50
def test_exclude_all():
    """Excluding every source matched by the tag leaves an empty list."""
    excluded = {"hh_backend", "hh_frontend"}
    selected = select_sources(SOURCES, "tag:hh", excludes=excluded)
    assert selected == []
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: vacancies-parser-kit
3
+ Version: 0.1.0
4
+ Summary: Multi-source vacancy parser with pluggable storage backends
5
+ License: MIT
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: click>=8.1
8
+ Requires-Dist: pyyaml>=6.0
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: jinja2>=3.1
11
+ Requires-Dist: rich>=13.0
12
+ Provides-Extra: hh
13
+ Requires-Dist: httpx>=0.27; extra == "hh"
14
+ Provides-Extra: telegram
15
+ Requires-Dist: telethon>=1.36; extra == "telegram"
16
+ Provides-Extra: linkedin
17
+ Requires-Dist: httpx>=0.27; extra == "linkedin"
18
+ Provides-Extra: clickhouse
19
+ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
20
+ Provides-Extra: postgres
21
+ Requires-Dist: asyncpg>=0.29; extra == "postgres"
22
+ Requires-Dist: psycopg[binary]>=3.1; extra == "postgres"
23
+ Provides-Extra: all
24
+ Requires-Dist: vpr[clickhouse,hh,linkedin,postgres,telegram]; extra == "all"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0; extra == "dev"
27
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
28
+ Requires-Dist: ruff>=0.4; extra == "dev"
@@ -0,0 +1,43 @@
1
+ .gitignore
2
+ CLAUDE.md
3
+ pyproject.toml
4
+ tests/__init__.py
5
+ tests/test_headhunter.py
6
+ tests/test_selector.py
7
+ vacancies_parser_kit.egg-info/PKG-INFO
8
+ vacancies_parser_kit.egg-info/SOURCES.txt
9
+ vacancies_parser_kit.egg-info/dependency_links.txt
10
+ vacancies_parser_kit.egg-info/entry_points.txt
11
+ vacancies_parser_kit.egg-info/requires.txt
12
+ vacancies_parser_kit.egg-info/top_level.txt
13
+ vpr/__init__.py
14
+ vpr/cli/__init__.py
15
+ vpr/cli/main.py
16
+ vpr/cli/commands/__init__.py
17
+ vpr/cli/commands/init.py
18
+ vpr/cli/commands/list.py
19
+ vpr/cli/commands/run.py
20
+ vpr/config/__init__.py
21
+ vpr/config/profile.py
22
+ vpr/config/project.py
23
+ vpr/config/source_config.py
24
+ vpr/core/__init__.py
25
+ vpr/core/models.py
26
+ vpr/core/pipeline.py
27
+ vpr/core/selector.py
28
+ vpr/sources/__init__.py
29
+ vpr/sources/base.py
30
+ vpr/sources/headhunter.py
31
+ vpr/sources/hh_dictionaries.py
32
+ vpr/sources/registry.py
33
+ vpr/templates/__init__.py
34
+ vpr/templates/scaffold.py
35
+ vpr/transforms/__init__.py
36
+ vpr/transforms/normalize.py
37
+ vpr/writers/__init__.py
38
+ vpr/writers/base.py
39
+ vpr/writers/clickhouse.py
40
+ vpr/writers/jsonl.py
41
+ vpr/writers/postgres.py
42
+ vpr/writers/registry.py
43
+ vpr/writers/sqlite.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ vpr = vpr.cli.main:cli
@@ -0,0 +1,29 @@
1
+ click>=8.1
2
+ pyyaml>=6.0
3
+ pydantic>=2.0
4
+ jinja2>=3.1
5
+ rich>=13.0
6
+
7
+ [all]
8
+ vpr[clickhouse,hh,linkedin,postgres,telegram]
9
+
10
+ [clickhouse]
11
+ clickhouse-connect>=0.7
12
+
13
+ [dev]
14
+ pytest>=8.0
15
+ pytest-asyncio>=0.23
16
+ ruff>=0.4
17
+
18
+ [hh]
19
+ httpx>=0.27
20
+
21
+ [linkedin]
22
+ httpx>=0.27
23
+
24
+ [postgres]
25
+ asyncpg>=0.29
26
+ psycopg[binary]>=3.1
27
+
28
+ [telegram]
29
+ telethon>=1.36
@@ -0,0 +1,3 @@
1
+ """vpr — multi-source vacancy parser."""
2
+
3
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,29 @@
1
+ """vpr init — scaffold a new project."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import click
8
+ from rich.console import Console
9
+
10
+ from vpr.templates import render_project_scaffold
11
+
12
+ console = Console()
13
+
14
+
15
@click.command("init")
@click.argument("project_name")
def init_cmd(project_name: str) -> None:
    """Create a new vpr project."""
    project_dir = Path.cwd() / project_name
    # Refuse to scaffold into a path that already exists.
    if project_dir.exists():
        raise click.ClickException(f"Directory '{project_name}' already exists")

    render_project_scaffold(project_dir, project_name)

    # Confirmation plus follow-up hints, printed one line at a time.
    follow_up = (
        f"[green]Project '{project_name}' created at {project_dir}[/green]",
        "Next steps:",
        f" cd {project_name}",
        " # edit profiles.yml with your database credentials",
        " # add source definitions to sources/",
        " vpr run --select <source_name>",
    )
    for line in follow_up:
        console.print(line)
@@ -0,0 +1,59 @@
1
+ """vpr list — show all configured sources."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import click
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+
9
+ from vpr.config.project import find_project_root, load_project_config
10
+ from vpr.config.source_config import load_source_configs
11
+
12
+ console = Console()
13
+
14
+
15
@click.command("list")
@click.option("--tags", is_flag=True, help="Group sources by tag")
def list_cmd(tags: bool) -> None:
    """List all configured sources."""
    root = find_project_root()
    sources_dir = root / load_project_config(root).paths.sources
    configs = load_source_configs(sources_dir)

    # Nothing to render: say so and stop.
    if not configs:
        console.print("[yellow]No sources found.[/yellow]")
        return

    # --tags switches from the flat table to the grouped-by-tag view.
    render = _print_by_tags if tags else _print_table
    render(configs)
31
+
32
+
33
def _print_table(sources) -> None:
    """Print the sources as a table with one row per source, ordered by name."""
    table = Table(title="Sources")
    table.add_column("Name", style="cyan")
    table.add_column("Type", style="green")
    table.add_column("Tags")
    table.add_column("Targets")

    ordered = sorted(sources, key=lambda item: item.name)
    for item in ordered:
        tags_cell = ", ".join(item.tags)
        # A source without targets shows a dash instead of an empty cell.
        targets_cell = ", ".join(item.targets) if item.targets else "—"
        table.add_row(item.name, item.type, tags_cell, targets_cell)

    console.print(table)
48
+
49
+
50
def _print_by_tags(sources) -> None:
    """Print every tag as a heading followed by the names of the sources carrying it.

    Tags and the names under each tag are printed in sorted order. A source
    with no tags does not appear in the output.
    """
    names_by_tag: dict[str, list[str]] = {}
    for source in sources:
        for tag in source.tags:
            if tag not in names_by_tag:
                names_by_tag[tag] = []
            names_by_tag[tag].append(source.name)

    for tag, names in sorted(names_by_tag.items(), key=lambda pair: pair[0]):
        console.print(f"[bold]tag:{tag}[/bold]")
        for name in sorted(names):
            console.print(f" - {name}")
@@ -0,0 +1,92 @@
1
+ """vpr run — fetch, transform, and write vacancies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import TYPE_CHECKING
7
+
8
+ import click
9
+ from rich.console import Console
10
+
11
+ from vpr.config.project import find_project_root, load_project_config
12
+ from vpr.config.profile import load_profiles
13
+ from vpr.config.source_config import load_source_configs
14
+ from vpr.core.selector import select_sources
15
+
16
+ if TYPE_CHECKING:
17
+ pass
18
+
19
+ console = Console()
20
+
21
+ VALID_STEPS = ("fetch", "transform", "write")
22
+
23
+
24
@click.command("run")
@click.option("--select", "selector", required=True, help="Source selector: name, glob pattern, or tag:<tag>")
@click.option("--exclude", "excludes", multiple=True, help="Exclude sources by name")
@click.option("--steps", default=None, help=f"Comma-separated pipeline steps: {', '.join(VALID_STEPS)}")
@click.option("--profile", default=None, help="Override default profile from vpr_project.yml")
@click.option("--full-refresh", is_flag=True, help="Drop existing data and re-fetch from scratch")
def run_cmd(
    selector: str,
    excludes: tuple[str, ...],
    steps: str | None,
    profile: str | None,
    full_refresh: bool,
) -> None:
    """Run the vacancy pipeline for selected sources."""
    # Validate --steps before touching the project on disk.
    active_steps = _parse_steps(steps)

    root = find_project_root()
    project_cfg = load_project_config(root)
    profiles_cfg = load_profiles(root)

    available = load_source_configs(root / project_cfg.paths.sources)
    if not available:
        raise click.ClickException(f"No source configs found in {project_cfg.paths.sources}/")

    chosen = select_sources(available, selector, excludes=set(excludes))
    if not chosen:
        raise click.ClickException(f"No sources matched selector '{selector}'")

    # An explicit --profile wins over the project's default profile.
    profile_name = profile if profile else project_cfg.default_profile
    if profile_name not in profiles_cfg.profiles:
        raise click.ClickException(f"Profile '{profile_name}' not found in profiles.yml")

    # Summary of what is about to run, followed by a blank separator line.
    summary = [
        f"[bold]Profile:[/bold] {profile_name}",
        f"[bold]Steps:[/bold] {', '.join(active_steps)}",
        f"[bold]Sources ({len(chosen)}):[/bold]",
    ]
    summary.extend(f" - {src.name} [dim]({src.type})[/dim]" for src in chosen)
    for line in summary:
        console.print(line)
    console.print()

    asyncio.run(_run_pipeline(chosen, profiles_cfg, profile_name, active_steps, full_refresh))
64
+
65
+
66
async def _run_pipeline(sources, profiles_cfg, profile_name, steps, full_refresh):
    """Execute the pipeline for each matched source, one after another.

    A failure in one source is reported and does not stop the remaining
    sources from running.

    Args:
        sources: Source configs to process, in order.
        profiles_cfg: Loaded profiles; ``profiles_cfg.profiles[profile_name]`` is used.
        profile_name: Key of the profile to run against.
        steps: Pipeline step names passed through to ``run_source_pipeline``.
        full_refresh: Passed through to ``run_source_pipeline``.

    Raises:
        click.ClickException: After every source has been attempted, if at
            least one of them failed. This makes the CLI exit non-zero so that
            an external scheduler can detect the failure.
    """
    from rich.markup import escape

    from vpr.core.pipeline import run_source_pipeline

    profile = profiles_cfg.profiles[profile_name]
    failed: list[str] = []
    for src_cfg in sources:
        console.print(f"[bold cyan]>>> {src_cfg.name}[/bold cyan]")
        try:
            await run_source_pipeline(
                source_config=src_cfg,
                profile=profile,
                steps=steps,
                full_refresh=full_refresh,
            )
        except Exception as exc:
            # Keep going with the other sources, but remember the failure.
            failed.append(src_cfg.name)
            # Exception text is arbitrary: escape it so bracketed fragments are
            # printed literally instead of being parsed as console markup.
            console.print(f"[red] ✗ {src_cfg.name} failed: {escape(str(exc))}[/red]")
        else:
            console.print(f"[green] ✓ {src_cfg.name} done[/green]")

    if failed:
        raise click.ClickException(f"{len(failed)} source(s) failed: {', '.join(failed)}")
83
+
84
+
85
def _parse_steps(raw: str | None) -> list[str]:
    """Turn the ``--steps`` option value into a list of step names.

    ``None`` selects every step in ``VALID_STEPS``. Otherwise the value is split
    on commas, each piece is stripped of surrounding whitespace, and the pieces
    are returned in the order given.

    Raises:
        click.ClickException: For the first piece that is not in ``VALID_STEPS``.
    """
    if raw is None:
        return list(VALID_STEPS)

    requested = [piece.strip() for piece in raw.split(",")]
    unknown = next((step for step in requested if step not in VALID_STEPS), None)
    if unknown is not None:
        raise click.ClickException(f"Unknown step '{unknown}'. Valid: {', '.join(VALID_STEPS)}")
    return requested