vacancies-parser-kit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vacancies_parser_kit-0.1.0.dist-info/METADATA +28 -0
- vacancies_parser_kit-0.1.0.dist-info/RECORD +36 -0
- vacancies_parser_kit-0.1.0.dist-info/WHEEL +5 -0
- vacancies_parser_kit-0.1.0.dist-info/entry_points.txt +2 -0
- vacancies_parser_kit-0.1.0.dist-info/top_level.txt +1 -0
- vpr/__init__.py +3 -0
- vpr/cli/__init__.py +0 -0
- vpr/cli/commands/__init__.py +0 -0
- vpr/cli/commands/init.py +29 -0
- vpr/cli/commands/list.py +59 -0
- vpr/cli/commands/run.py +92 -0
- vpr/cli/main.py +19 -0
- vpr/config/__init__.py +0 -0
- vpr/config/profile.py +50 -0
- vpr/config/project.py +41 -0
- vpr/config/source_config.py +46 -0
- vpr/core/__init__.py +0 -0
- vpr/core/models.py +66 -0
- vpr/core/pipeline.py +59 -0
- vpr/core/selector.py +35 -0
- vpr/sources/__init__.py +3 -0
- vpr/sources/base.py +28 -0
- vpr/sources/headhunter.py +254 -0
- vpr/sources/hh_dictionaries.py +82 -0
- vpr/sources/registry.py +27 -0
- vpr/templates/__init__.py +3 -0
- vpr/templates/scaffold.py +99 -0
- vpr/transforms/__init__.py +0 -0
- vpr/transforms/normalize.py +18 -0
- vpr/writers/__init__.py +15 -0
- vpr/writers/base.py +25 -0
- vpr/writers/clickhouse.py +108 -0
- vpr/writers/jsonl.py +28 -0
- vpr/writers/postgres.py +132 -0
- vpr/writers/registry.py +27 -0
- vpr/writers/sqlite.py +114 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vacancies-parser-kit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-source vacancy parser with pluggable storage backends
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: click>=8.1
|
|
8
|
+
Requires-Dist: pyyaml>=6.0
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: jinja2>=3.1
|
|
11
|
+
Requires-Dist: rich>=13.0
|
|
12
|
+
Provides-Extra: hh
|
|
13
|
+
Requires-Dist: httpx>=0.27; extra == "hh"
|
|
14
|
+
Provides-Extra: telegram
|
|
15
|
+
Requires-Dist: telethon>=1.36; extra == "telegram"
|
|
16
|
+
Provides-Extra: linkedin
|
|
17
|
+
Requires-Dist: httpx>=0.27; extra == "linkedin"
|
|
18
|
+
Provides-Extra: clickhouse
|
|
19
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
20
|
+
Provides-Extra: postgres
|
|
21
|
+
Requires-Dist: asyncpg>=0.29; extra == "postgres"
|
|
22
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == "postgres"
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: vpr[clickhouse,hh,linkedin,postgres,telegram]; extra == "all"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
vpr/__init__.py,sha256=-8BZGB5_iPPXXECpGVGmU9uYVVlft3YVyeULLyn_gbQ,66
|
|
2
|
+
vpr/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
vpr/cli/main.py,sha256=Oxruo2Lgez8aL8phbiqSK9u7-mxbTYW_0Puu0fMcwX8,438
|
|
4
|
+
vpr/cli/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
vpr/cli/commands/init.py,sha256=df2mxEm7DvQVj95gVTH8r94gaXc9lmqNq2AxTMhMMnM,898
|
|
6
|
+
vpr/cli/commands/list.py,sha256=rNgp6r0Y89FzBX14dM7FH2Oqv4-YvntrbknqXPn80oU,1648
|
|
7
|
+
vpr/cli/commands/run.py,sha256=6axJ2LkOpsqSAylCn4OV4qvf8xy-0shLJ7cAt_vm0kw,3424
|
|
8
|
+
vpr/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
vpr/config/profile.py,sha256=YdTQ7-oSX_zaA6M-FvrFY_YkXA_qLUKUTUeE_3FxTyg,1299
|
|
10
|
+
vpr/config/project.py,sha256=zaNcWI8IQun1SDEpbsICSFcTLIo3WkNUHmgwsw-vze0,1130
|
|
11
|
+
vpr/config/source_config.py,sha256=yh2kxSXLKRB6hPlrWe9mKRncsq5D3HKhBsvtWm-iDdg,1278
|
|
12
|
+
vpr/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
vpr/core/models.py,sha256=g8y32wqDOwDM9VEKEKs4cgUzFFie_MYB0MavM7kHmd0,1663
|
|
14
|
+
vpr/core/pipeline.py,sha256=Kfb6ruXIGCpb-75kyUrqXjqMx8ow8CuDspz_8Uf5Gz8,1990
|
|
15
|
+
vpr/core/selector.py,sha256=qeH45-y08eR2q0aoDSUeAF2A-j9GksScxnKYAczL5xc,1068
|
|
16
|
+
vpr/sources/__init__.py,sha256=JIbkAFKHpwgVTiTgN8uC7JPAUUilVDNSkl952Kf4UEg,119
|
|
17
|
+
vpr/sources/base.py,sha256=A7gqRKYTBoMgFvVBqRqpWRKVN1zM2rAsSO_A7N-nGAU,802
|
|
18
|
+
vpr/sources/headhunter.py,sha256=cRip32dXf3qVKe3aqbG1kFj9CqWiQL5LovjzMMnTguA,9606
|
|
19
|
+
vpr/sources/hh_dictionaries.py,sha256=wrCPGM8aA9IpOtjCTV1u9PHSKjDRRhciiVzHp3qvHdM,2405
|
|
20
|
+
vpr/sources/registry.py,sha256=oKusJkHrSlgT5aym9eDcbHv9W9q65HcR55ZWuljxZE0,757
|
|
21
|
+
vpr/templates/__init__.py,sha256=HoBzfwDVFdQ3x3cBTBsWjmC3-vfV2aN92rAGklgICnU,98
|
|
22
|
+
vpr/templates/scaffold.py,sha256=Z2vyN9s5Nv0NaIHxj9sRIQpwnySYww7jEB7r7hNfyHA,2856
|
|
23
|
+
vpr/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
vpr/transforms/normalize.py,sha256=licHq8Hb74Pyt0Je2HAg4qnM_4yw2b59jFr0zyC5rts,519
|
|
25
|
+
vpr/writers/__init__.py,sha256=GxHSSrh1OUAJ5NMpSCOTWC-ldSHAYv0LWvlUy42xFXA,398
|
|
26
|
+
vpr/writers/base.py,sha256=PtdQzlYvOc49NoiGPUP8jecvcYit_zUR9wQQRgGtOCs,743
|
|
27
|
+
vpr/writers/clickhouse.py,sha256=E8gJhC7OiSvUcs40XuOyDSN-g5krhoeqRj6PCGG89Rc,3471
|
|
28
|
+
vpr/writers/jsonl.py,sha256=F6G3Lb94juXXqOOenxoxYLX7EohNiIoqTn_-7noAjQ4,821
|
|
29
|
+
vpr/writers/postgres.py,sha256=WscPr5d4uINUVvfAxfxETfpVcB3iO1Ymxbb3GHBLRgk,4132
|
|
30
|
+
vpr/writers/registry.py,sha256=T0MtCNvgsyxryIiIqFbcfOLskM_hoiIseD2l1jGWAS4,757
|
|
31
|
+
vpr/writers/sqlite.py,sha256=rwxVvAX6iF75v3L7XadrPaYvHCFboiENJD23JE6VmfE,3673
|
|
32
|
+
vacancies_parser_kit-0.1.0.dist-info/METADATA,sha256=WtYAcSdMfbYnNDHBhzcIPelW_WZ7-REYk9ilcjKs7i8,989
|
|
33
|
+
vacancies_parser_kit-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
34
|
+
vacancies_parser_kit-0.1.0.dist-info/entry_points.txt,sha256=egwVIIn97VkYCJ9ZgKTyk_8nIeuTCQe-tSEIKywo1Y8,41
|
|
35
|
+
vacancies_parser_kit-0.1.0.dist-info/top_level.txt,sha256=YtP4jygNOfY9lRC2fgxBLU-n1WVbODLp75tLRznx_0M,4
|
|
36
|
+
vacancies_parser_kit-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
vpr
|
vpr/__init__.py
ADDED
vpr/cli/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
vpr/cli/commands/init.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""vpr init — scaffold a new project."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
|
|
10
|
+
from vpr.templates import render_project_scaffold
|
|
11
|
+
|
|
12
|
+
# Module-level Rich console; init_cmd prints its progress and next-step hints through it.
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.command("init")
@click.argument("project_name")
def init_cmd(project_name: str) -> None:
    """Create a new vpr project."""
    # The project always becomes a direct child of the current working directory.
    destination = Path.cwd() / project_name
    if destination.exists():
        # Never scaffold over something that is already on disk.
        raise click.ClickException(f"Directory '{project_name}' already exists")

    render_project_scaffold(destination, project_name)

    # Confirmation followed by a short "what to do next" checklist.
    follow_up = (
        f"[green]Project '{project_name}' created at {destination}[/green]",
        "Next steps:",
        f" cd {project_name}",
        " # edit profiles.yml with your database credentials",
        " # add source definitions to sources/",
        " vpr run --select <source_name>",
    )
    for line in follow_up:
        console.print(line)
|
vpr/cli/commands/list.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""vpr list — show all configured sources."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from vpr.config.project import find_project_root, load_project_config
|
|
10
|
+
from vpr.config.source_config import load_source_configs
|
|
11
|
+
|
|
12
|
+
# Module-level Rich console shared by list_cmd and its two printing helpers.
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.command("list")
@click.option("--tags", is_flag=True, help="Group sources by tag")
def list_cmd(tags: bool) -> None:
    """List all configured sources."""
    # tags: when set, print source names grouped under each tag instead of
    #       the default table view.
    #
    # Raises click.ClickException when no project file can be located, so the
    # user sees a one-line error instead of a Python traceback.
    try:
        project_root = find_project_root()
        project_cfg = load_project_config(project_root)
    except FileNotFoundError as exc:
        raise click.ClickException(str(exc)) from exc

    sources = load_source_configs(project_root / project_cfg.paths.sources)

    if not sources:
        console.print("[yellow]No sources found.[/yellow]")
        return

    if tags:
        _print_by_tags(sources)
    else:
        _print_table(sources)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _print_table(sources) -> None:
    """Render *sources* as a Rich table, one row per source, ordered by name."""
    # (header, extra Table.add_column keyword arguments)
    column_specs = (
        ("Name", {"style": "cyan"}),
        ("Type", {"style": "green"}),
        ("Tags", {}),
        ("Targets", {}),
    )
    table = Table(title="Sources")
    for header, options in column_specs:
        table.add_column(header, **options)

    for entry in sorted(sources, key=lambda s: s.name):
        # An em dash stands in for "no explicit targets configured".
        if entry.targets:
            targets_cell = ", ".join(entry.targets)
        else:
            targets_cell = "—"
        table.add_row(entry.name, entry.type, ", ".join(entry.tags), targets_cell)

    console.print(table)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _print_by_tags(sources) -> None:
    """Print source names grouped under each tag, tags and names both sorted.

    Sources without any tag contribute no pairs and therefore do not appear.
    """
    # Flatten to (tag, name) pairs; tuple ordering sorts by tag first and by
    # name within a tag, which is exactly the grouping order we print in.
    pairs = sorted((tag, src.name) for src in sources for tag in src.tags)

    current_tag = None
    for tag, name in pairs:
        if tag != current_tag:
            console.print(f"[bold]tag:{tag}[/bold]")
            current_tag = tag
        console.print(f" - {name}")
|
vpr/cli/commands/run.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""vpr run — fetch, transform, and write vacancies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
from vpr.config.project import find_project_root, load_project_config
|
|
12
|
+
from vpr.config.profile import load_profiles
|
|
13
|
+
from vpr.config.source_config import load_source_configs
|
|
14
|
+
from vpr.core.selector import select_sources
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
# Module-level Rich console used by run_cmd and _run_pipeline for status output.
console = Console()
|
|
20
|
+
|
|
21
|
+
# Step names accepted by --steps; also the default step list when --steps is omitted.
VALID_STEPS = ("fetch", "transform", "write")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@click.command("run")
@click.option("--select", "selector", required=True, help="Source selector: name, glob pattern, or tag:<tag>")
@click.option("--exclude", "excludes", multiple=True, help="Exclude sources by name")
@click.option("--steps", default=None, help=f"Comma-separated pipeline steps: {', '.join(VALID_STEPS)}")
@click.option("--profile", default=None, help="Override default profile from vpr_project.yml")
@click.option("--full-refresh", is_flag=True, help="Drop existing data and re-fetch from scratch")
def run_cmd(
    selector: str,
    excludes: tuple[str, ...],
    steps: str | None,
    profile: str | None,
    full_refresh: bool,
) -> None:
    """Run the vacancy pipeline for selected sources."""
    # selector:     value of --select, resolved by select_sources
    # excludes:     source names removed from the selection
    # steps:        raw --steps string; None means "all steps"
    # profile:      profile name overriding the project's default_profile
    # full_refresh: forwarded unchanged to the per-source pipeline
    #
    # Every user-facing failure is raised as click.ClickException so the
    # command prints a one-line error and exits non-zero.
    active_steps = _parse_steps(steps)

    try:
        project_root = find_project_root()
        project_cfg = load_project_config(project_root)
        profiles_cfg = load_profiles(project_root)
    except FileNotFoundError as exc:
        # Missing vpr_project.yml / profiles.yml is a usage problem, not a
        # crash: report it without a traceback.
        raise click.ClickException(str(exc)) from exc

    all_sources = load_source_configs(project_root / project_cfg.paths.sources)

    if not all_sources:
        raise click.ClickException(f"No source configs found in {project_cfg.paths.sources}/")

    matched = select_sources(all_sources, selector, excludes=set(excludes))
    if not matched:
        raise click.ClickException(f"No sources matched selector '{selector}'")

    # The command-line override wins over the project-level default.
    profile_name = profile or project_cfg.default_profile
    if profile_name not in profiles_cfg.profiles:
        raise click.ClickException(f"Profile '{profile_name}' not found in profiles.yml")

    console.print(f"[bold]Profile:[/bold] {profile_name}")
    console.print(f"[bold]Steps:[/bold] {', '.join(active_steps)}")
    console.print(f"[bold]Sources ({len(matched)}):[/bold]")
    for src in matched:
        console.print(f" - {src.name} [dim]({src.type})[/dim]")
    console.print()

    asyncio.run(_run_pipeline(matched, profiles_cfg, profile_name, active_steps, full_refresh))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
async def _run_pipeline(sources, profiles_cfg, profile_name, steps, full_refresh):
    """Execute the pipeline for each matched source.

    Sources are processed one after another. A failing source is reported and
    does not prevent the remaining sources from running; once every source has
    been attempted, any failure is turned into a ``click.ClickException`` so
    the command exits with a non-zero status instead of silently succeeding.

    Args:
        sources: source configs to run, in execution order.
        profiles_cfg: loaded profiles; ``profiles_cfg.profiles[profile_name]``
            is handed to every source pipeline.
        profile_name: key of the profile to use (must exist in *profiles_cfg*).
        steps: pipeline step names to execute.
        full_refresh: forwarded unchanged to ``run_source_pipeline``.

    Raises:
        click.ClickException: if at least one source failed.
    """
    # Imported lazily, as in the original, so the pipeline modules are only
    # loaded when a run actually starts.
    from vpr.core.pipeline import run_source_pipeline

    profile = profiles_cfg.profiles[profile_name]
    failed: list[str] = []
    for src_cfg in sources:
        console.print(f"[bold cyan]>>> {src_cfg.name}[/bold cyan]")
        try:
            await run_source_pipeline(
                source_config=src_cfg,
                profile=profile,
                steps=steps,
                full_refresh=full_refresh,
            )
        except Exception as exc:
            # Top-level boundary: keep going with the other sources, but
            # remember the failure for the final exit status.
            failed.append(src_cfg.name)
            console.print(f"[red] ✗ {src_cfg.name} failed: {exc}[/red]")
        else:
            console.print(f"[green] ✓ {src_cfg.name} done[/green]")

    if failed:
        raise click.ClickException(
            f"{len(failed)} of {len(sources)} source(s) failed: {', '.join(failed)}"
        )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _parse_steps(raw: str | None) -> list[str]:
    """Turn the raw ``--steps`` value into a validated list of step names.

    Args:
        raw: comma-separated step names, or ``None`` when the option was
            not given.

    Returns:
        The requested steps without duplicates, in ``VALID_STEPS`` order.
        All of ``VALID_STEPS`` when *raw* is ``None``.

    Raises:
        click.ClickException: if a name is not in ``VALID_STEPS`` or no step
            name is left after dropping blank segments.
    """
    if raw is None:
        return list(VALID_STEPS)

    # Blank segments come from harmless typos such as "fetch," or
    # "fetch,,write"; ignore them rather than reporting "Unknown step ''".
    requested = [part.strip() for part in raw.split(",")]
    requested = [part for part in requested if part]
    if not requested:
        raise click.ClickException(f"No steps given. Valid: {', '.join(VALID_STEPS)}")

    for step in requested:
        if step not in VALID_STEPS:
            raise click.ClickException(f"Unknown step '{step}'. Valid: {', '.join(VALID_STEPS)}")

    # Canonical order, no duplicates: the list is only used for membership
    # tests and for the "Steps:" banner, which should show the real order.
    return [step for step in VALID_STEPS if step in requested]
|
vpr/cli/main.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""CLI entry point for vpr."""
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
|
+
from vpr import __version__
|
|
6
|
+
from vpr.cli.commands.init import init_cmd
|
|
7
|
+
from vpr.cli.commands.list import list_cmd
|
|
8
|
+
from vpr.cli.commands.run import run_cmd
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@click.group()
@click.version_option(version=__version__, prog_name="vpr")
def cli() -> None:
    """vpr — multi-source vacancy parser."""


# Register the sub-commands under their public names.
cli.add_command(init_cmd, name="init")
cli.add_command(run_cmd, name="run")
cli.add_command(list_cmd, name="list")
|
vpr/config/__init__.py
ADDED
|
File without changes
|
vpr/config/profile.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Profiles configuration (profiles.yml)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import yaml
|
|
11
|
+
from pydantic import BaseModel
|
|
12
|
+
|
|
13
|
+
# Matches a `{{ env_var('NAME') }}` placeholder: optional whitespace inside the
# braces, single or double quotes around NAME, and NAME (word characters only)
# captured as group 1.
_ENV_VAR_RE = re.compile(r"\{\{\s*env_var\(['\"](\w+)['\"]\)\s*\}\}")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# One write target of a profile.
class TargetConfig(BaseModel):
    # Writer type name; the pipeline passes it to get_writer_class().
    type: str
    # Free-form settings for the target (presumably read by the writer — confirm).
    params: dict[str, Any] = {}

    # Unknown keys in the YAML are kept on the model instead of being rejected.
    model_config = {"extra": "allow"}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# A named profile: the set of write targets available under it.
class ProfileConfig(BaseModel):
    # Target name -> target definition.
    targets: dict[str, TargetConfig]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Top-level structure of profiles.yml.
class ProfilesConfig(BaseModel):
    # Defaults to "dev" when the file does not set it.
    default_profile: str = "dev"
    # Profile name -> profile definition.
    profiles: dict[str, ProfileConfig]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _resolve_env_vars(obj: Any) -> Any:
|
|
33
|
+
"""Recursively resolve {{ env_var('NAME') }} placeholders."""
|
|
34
|
+
if isinstance(obj, str):
|
|
35
|
+
return _ENV_VAR_RE.sub(lambda m: os.environ.get(m.group(1), ""), obj)
|
|
36
|
+
if isinstance(obj, dict):
|
|
37
|
+
return {k: _resolve_env_vars(v) for k, v in obj.items()}
|
|
38
|
+
if isinstance(obj, list):
|
|
39
|
+
return [_resolve_env_vars(v) for v in obj]
|
|
40
|
+
return obj
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load_profiles(project_root: Path) -> ProfilesConfig:
    """Load and validate ``profiles.yml`` from *project_root*.

    Args:
        project_root: directory that contains ``profiles.yml``.

    Returns:
        The parsed profiles with ``{{ env_var('NAME') }}`` placeholders
        already resolved.

    Raises:
        FileNotFoundError: if ``profiles.yml`` does not exist.
        ValueError: if the file is empty or its top level is not a mapping.
    """
    path = project_root / "profiles.yml"
    if not path.is_file():
        raise FileNotFoundError(f"profiles.yml not found in {project_root}")
    # Explicit UTF-8: the default text encoding is locale-dependent and would
    # mis-decode non-ASCII values on some platforms.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # An empty file loads as None and a list/scalar cannot be splatted into
    # the model; fail with a message that names the file.
    if not isinstance(data, dict):
        raise ValueError(f"profiles.yml must contain a mapping at the top level: {path}")
    data = _resolve_env_vars(data)
    return ProfilesConfig(**data)
|
vpr/config/project.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Project-level configuration (vpr_project.yml)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
# File name that marks a directory as a vpr project root.
PROJECT_FILE = "vpr_project.yml"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Directory names configured in the project file.
class ProjectPaths(BaseModel):
    # Joined onto the project root by the CLI to find source YAML files.
    sources: str = "sources"
    # Not read anywhere in the visible code — presumably a dictionaries folder; confirm.
    dictionaries: str = "dictionaries"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Top-level structure of vpr_project.yml.
class ProjectConfig(BaseModel):
    # Required: the only field without a default.
    name: str
    version: str = "1.0"
    # Profile used by `vpr run` when --profile is not given.
    default_profile: str = "dev"
    # A fresh ProjectPaths with its defaults when the file omits `paths`.
    paths: ProjectPaths = Field(default_factory=ProjectPaths)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def find_project_root(start: Path | None = None) -> Path:
    """Locate the closest directory, at or above *start*, holding the project file.

    *start* defaults to the current working directory and is resolved to an
    absolute path before the search. Raises ``FileNotFoundError`` when neither
    it nor any of its parents contains ``PROJECT_FILE``.
    """
    origin = (start or Path.cwd()).resolve()
    # Nearest directory first, filesystem root last.
    candidates = [origin, *origin.parents]
    root = next(
        (folder for folder in candidates if (folder / PROJECT_FILE).is_file()),
        None,
    )
    if root is None:
        raise FileNotFoundError(
            f"Could not find {PROJECT_FILE}. "
            "Are you inside a vpr project? Run 'vpr init <name>' to create one."
        )
    return root
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def load_project_config(project_root: Path) -> ProjectConfig:
    """Load and validate the project file found in *project_root*.

    Args:
        project_root: directory that contains ``PROJECT_FILE``.

    Returns:
        The parsed project configuration.

    Raises:
        FileNotFoundError: if the project file does not exist (from ``open``).
        ValueError: if the file is empty or its top level is not a mapping.
    """
    path = project_root / PROJECT_FILE
    # Explicit UTF-8: the default text encoding is locale-dependent.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # An empty file loads as None; report it clearly instead of failing on
    # the ** unpacking below.
    if not isinstance(data, dict):
        raise ValueError(f"{PROJECT_FILE} must contain a mapping at the top level: {path}")
    return ProjectConfig(**data)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Source instance configuration (sources/*.yml)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SourceConfig(BaseModel):
    """A single source instance loaded from YAML."""

    # Name matched by --select / --exclude and shown in listings.
    name: str
    # Source type; the pipeline passes it to get_source_class().
    type: str
    # Labels matched by the `tag:<tag>` selector form.
    tags: list[str] = Field(default_factory=list)
    # Not read anywhere in the visible code — presumably a per-source profile override; confirm.
    profile: str | None = None
    # Handed to the source plugin as `self.params`.
    params: dict[str, Any] = Field(default_factory=dict)
    # Names of profile targets to write to; when empty the pipeline uses every target of the profile.
    targets: list[str] = Field(default_factory=list)
    # Sources with enabled=False are dropped by load_source_configs().
    enabled: bool = True

    # metadata — filled at load time
    # Set by load_source_config(); declared with exclude=True.
    config_path: Path | None = Field(default=None, exclude=True)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_source_config(path: Path) -> SourceConfig:
    """Load one source definition from the YAML file at *path*.

    Args:
        path: location of the source YAML file.

    Returns:
        The validated config with ``config_path`` set to *path*.

    Raises:
        ValueError: if the file is empty or its top level is not a mapping.
    """
    # Explicit UTF-8: the default text encoding is locale-dependent.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if not data:
        raise ValueError(f"Empty source config: {path}")
    # A list or scalar document cannot be splatted into the model; name the
    # offending file instead of failing on the ** unpacking.
    if not isinstance(data, dict):
        raise ValueError(f"Source config must contain a mapping at the top level: {path}")
    cfg = SourceConfig(**data)
    cfg.config_path = path
    return cfg
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def load_source_configs(sources_dir: Path) -> list[SourceConfig]:
    """Collect every enabled source defined by a ``*.yml`` file under *sources_dir*.

    The directory is searched recursively and files are loaded in sorted path
    order. A missing directory yields an empty list.
    """
    if not sources_dir.is_dir():
        return []

    yaml_files = sorted(sources_dir.rglob("*.yml"))
    # Disabled sources are parsed (and validated) but left out of the result.
    return [loaded for loaded in map(load_source_config, yaml_files) if loaded.enabled]
|
vpr/core/__init__.py
ADDED
|
File without changes
|
vpr/core/models.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Canonical vacancy model — unified schema all sources map into."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Lifecycle marker stored on each Vacancy. The str mixin makes members
# compare equal to their plain string values.
class VacancyStatus(str, Enum):
    CREATED = "created"  # default status of a new Vacancy
    UPDATED = "updated"
    DELETED = "deleted"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Salary information of a vacancy; every part is optional.
class Salary(BaseModel):
    min: float | None = None  # lower bound, if given
    max: float | None = None  # upper bound, if given
    currency: str | None = None  # presumably a currency code — confirm the format
    gross: bool | None = None  # presumably True means before tax — confirm
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Vacancy(BaseModel):
    """Canonical vacancy record.

    Composite identity: (source, source_id).
    All fields except source/source_id are nullable — sources may provide
    incomplete data and that's OK.
    """

    # --- identity ---
    # The two required fields; together they identify a record.
    source: str
    source_id: str

    # --- core ---
    title: str | None = None
    company: str | None = None
    description: str | None = None
    url: str | None = None

    # --- location ---
    city: str | None = None
    region: str | None = None
    country: str | None = None
    is_remote: bool | None = None

    # --- compensation ---
    salary: Salary | None = None

    # --- classification ---
    experience: str | None = None
    employment_type: str | None = None  # full-time, part-time, contract …
    schedule: str | None = None  # remote, office, hybrid …
    skills: list[str] = Field(default_factory=list)

    # --- timestamps ---
    published_at: datetime | None = None
    # Defaults to the timezone-aware UTC time at which the model instance is created.
    fetched_at: datetime = Field(default_factory=lambda: datetime.now(UTC))

    # --- lifecycle ---
    status: VacancyStatus = VacancyStatus.CREATED

    # --- extra ---
    # Free-form dict, empty by default (presumably the unmodified source payload — confirm).
    raw: dict[str, Any] = Field(default_factory=dict)
|
vpr/core/pipeline.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Pipeline: fetch → transform → write."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
|
|
7
|
+
from vpr.config.profile import ProfileConfig
|
|
8
|
+
from vpr.config.source_config import SourceConfig
|
|
9
|
+
from vpr.core.models import Vacancy
|
|
10
|
+
from vpr.sources.registry import get_source_class
|
|
11
|
+
from vpr.writers.registry import get_writer_class
|
|
12
|
+
|
|
13
|
+
# Module-level Rich console; run_source_pipeline reports per-step progress through it.
console = Console()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def run_source_pipeline(
    *,
    source_config: SourceConfig,
    profile: ProfileConfig,
    steps: list[str],
    full_refresh: bool = False,
) -> None:
    """Execute fetch → transform → write for a single source.

    Args:
        source_config: the source to run; its ``type`` selects the source
            class and its ``targets`` the write targets.
        profile: profile whose ``targets`` describe where to write.
        steps: step names to execute ("fetch", "transform", "write").
            Vacancies exist only in memory, so without "fetch" there is
            nothing for the later steps to work on.
        full_refresh: accepted for interface compatibility; this function
            does not act on it yet and prints a warning when it is set.

    Raises:
        Whatever the source, the transform or a writer raises; a writer that
        was constructed is always closed first.
    """
    if full_refresh:
        # The flag is not consumed anywhere below; say so instead of letting
        # the caller believe existing data was dropped.
        console.print(
            " [yellow]full refresh requested but not applied by this pipeline; "
            "existing data is left untouched[/yellow]"
        )

    vacancies: list[Vacancy] = []

    # --- FETCH ---
    if "fetch" in steps:
        console.print(" [dim]fetching…[/dim]")
        source_cls = get_source_class(source_config.type)
        source = source_cls(source_config)
        vacancies = [vacancy async for vacancy in source.fetch()]
        console.print(f" [dim]fetched {len(vacancies)} vacancies[/dim]")
    else:
        # Make the resulting no-op visible rather than finishing silently.
        console.print(" [yellow]'fetch' step not selected; nothing to transform or write[/yellow]")

    # --- TRANSFORM ---
    if "transform" in steps and vacancies:
        console.print(" [dim]transforming…[/dim]")
        from vpr.transforms.normalize import normalize

        vacancies = [normalize(v) for v in vacancies]

    # --- WRITE ---
    if "write" in steps and vacancies:
        # Explicit per-source targets win; otherwise write to every target
        # defined by the profile.
        targets = source_config.targets
        if not targets:
            targets = list(profile.targets.keys())

        for target_name in targets:
            if target_name not in profile.targets:
                console.print(f" [yellow]target '{target_name}' not found in profile, skipping[/yellow]")
                continue
            target_cfg = profile.targets[target_name]
            writer_cls = get_writer_class(target_cfg.type)
            writer = writer_cls(target_cfg)
            try:
                count = await writer.write(vacancies)
                console.print(f" [dim]wrote {count} rows → {target_name}[/dim]")
            finally:
                # Release the writer even when the write fails.
                await writer.close()
|
vpr/core/selector.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Source selector: resolve --select / --exclude to a list of SourceConfig."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from fnmatch import fnmatch
|
|
6
|
+
|
|
7
|
+
from vpr.config.source_config import SourceConfig
|
|
8
|
+
|
|
9
|
+
# Selectors starting with this prefix pick sources by tag instead of by name.
TAG_PREFIX = "tag:"


def select_sources(
    all_sources: list[SourceConfig],
    selector: str,
    *,
    excludes: set[str] | None = None,
) -> list[SourceConfig]:
    """Resolve *selector* against *all_sources* and drop excluded names.

    Selector forms:
    - ``tag:<tag>``     — every source carrying that tag
    - ``name_with_*``   — glob match on the source name (``*``, ``?``, ``[``)
    - ``exact_name``    — exact match on the source name

    The result keeps the order of *all_sources*; sources whose name is in
    *excludes* are removed regardless of how they were matched.
    """
    skipped = excludes if excludes else set()

    # Choose the matching rule once, then filter in a single pass.
    if selector.startswith(TAG_PREFIX):
        wanted_tag = selector.removeprefix(TAG_PREFIX)

        def is_match(candidate) -> bool:
            return wanted_tag in candidate.tags

    elif "*" in selector or "?" in selector or "[" in selector:

        def is_match(candidate) -> bool:
            return fnmatch(candidate.name, selector)

    else:

        def is_match(candidate) -> bool:
            return candidate.name == selector

    return [
        candidate
        for candidate in all_sources
        if is_match(candidate) and candidate.name not in skipped
    ]
|
vpr/sources/__init__.py
ADDED
vpr/sources/base.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Base class for vacancy sources."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import AsyncIterator
|
|
7
|
+
|
|
8
|
+
from vpr.config.source_config import SourceConfig
|
|
9
|
+
from vpr.core.models import Vacancy
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseSource(ABC):
    """Every source plugin extends this class.

    A source receives its YAML-defined ``params`` and yields raw Vacancy
    objects (with at least ``source`` and ``source_id`` filled).
    """

    source_type: str  # must match the ``type`` field in source YAML

    def __init__(self, config: SourceConfig) -> None:
        """Keep the full source config and expose its ``params`` directly."""
        self.config = config
        self.params = config.params

    @abstractmethod
    def fetch(self) -> AsyncIterator[Vacancy]:
        """Yield vacancies from the external source.

        Declared without ``async`` on purpose: the pipeline consumes the
        result with ``async for source.fetch()``, so calling ``fetch()`` must
        return an async iterator directly. An ``async def`` without ``yield``
        would instead be typed as a coroutine that *returns* an iterator.
        Subclasses normally implement this as an async generator
        (``async def fetch(self): ... yield vacancy``), which satisfies this
        signature.
        """
        ...  # pragma: no cover
|