linkedin-apply-assistant 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/ISSUE_TEMPLATE/bug_report.yml +72 -0
- package/.github/ISSUE_TEMPLATE/config.yml +5 -0
- package/.github/ISSUE_TEMPLATE/config_help.yml +49 -0
- package/.github/ISSUE_TEMPLATE/docs.yml +40 -0
- package/.github/ISSUE_TEMPLATE/feature_request.yml +45 -0
- package/.github/ISSUE_TEMPLATE/safety_compliance.yml +48 -0
- package/.github/PULL_REQUEST_TEMPLATE.md +43 -0
- package/CHANGELOG.md +47 -0
- package/CODE_OF_CONDUCT.md +47 -0
- package/CONTRIBUTING.md +64 -0
- package/GOVERNANCE.md +41 -0
- package/LEGAL.md +38 -0
- package/LICENSE +22 -0
- package/MIGRATION.md +50 -0
- package/README.md +167 -0
- package/RELEASE_CHECKLIST.md +454 -0
- package/SAFETY.md +33 -0
- package/SECURITY.md +37 -0
- package/SUPPORT.md +44 -0
- package/THIRD_PARTY_NOTICES.md +67 -0
- package/bin/linkedin-apply-assistant.mjs +95 -0
- package/configs/config.example.yml +24 -0
- package/configs/qa_bank.example.yml +35 -0
- package/docs/apply.md +40 -0
- package/docs/assist.md +35 -0
- package/docs/browser-session.md +45 -0
- package/docs/ci-and-release-policy.md +105 -0
- package/docs/commands.md +176 -0
- package/docs/install-and-configuration.md +265 -0
- package/docs/registry-publication-strategy.md +169 -0
- package/docs/reports.md +35 -0
- package/docs/search.md +39 -0
- package/docs/troubleshooting.md +57 -0
- package/examples/dry_run_input.example.json +25 -0
- package/examples/reports/apply-audit.example.json +31 -0
- package/examples/reports/search-report.example.json +40 -0
- package/install.ps1 +178 -0
- package/package.json +59 -0
- package/pyproject.toml +51 -0
- package/src/linkedin_apply_assistant/__init__.py +8 -0
- package/src/linkedin_apply_assistant/apply_reports.py +229 -0
- package/src/linkedin_apply_assistant/ats_handlers.py +217 -0
- package/src/linkedin_apply_assistant/browser_sessions.py +155 -0
- package/src/linkedin_apply_assistant/cli.py +570 -0
- package/src/linkedin_apply_assistant/config.py +109 -0
- package/src/linkedin_apply_assistant/contracts.py +255 -0
- package/src/linkedin_apply_assistant/form_engine.py +180 -0
- package/src/linkedin_apply_assistant/linkedin_layer.py +436 -0
- package/src/linkedin_apply_assistant/page_actions.py +110 -0
- package/src/linkedin_apply_assistant/page_selectors.py +88 -0
- package/src/linkedin_apply_assistant/paths.py +135 -0
- package/src/linkedin_apply_assistant/qa_bank.py +352 -0
- package/src/linkedin_apply_assistant/redaction.py +119 -0
- package/src/linkedin_apply_assistant/safety.py +230 -0
- package/src/linkedin_apply_assistant/workflows.py +435 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Runtime path resolution for the standalone assistant."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
try:
    from platformdirs import user_cache_dir, user_config_dir, user_data_dir
except ModuleNotFoundError:
    # platformdirs is not importable: provide minimal stand-ins with the same
    # names and the same ``(appname) -> str`` call shape used in this module.

    def _windows_base(env_name: str, fallback: Path) -> Path:
        """Return the directory named by *env_name*, or *fallback* if unset or empty."""
        configured = os.environ.get(env_name)
        if not configured:
            return fallback
        return Path(configured).expanduser()

    def user_config_dir(appname: str) -> str:
        """Return ``%APPDATA%/<appname>``, else ``~/.config/<appname>``."""
        root = _windows_base("APPDATA", Path.home() / ".config")
        return str(root / appname)

    def user_data_dir(appname: str) -> str:
        """Return ``%LOCALAPPDATA%/<appname>``, else ``~/.local/share/<appname>``."""
        root = _windows_base("LOCALAPPDATA", Path.home() / ".local" / "share")
        return str(root / appname)

    def user_cache_dir(appname: str) -> str:
        """Return ``<base>/<appname>/cache`` under ``%LOCALAPPDATA%`` or ``~/.cache``."""
        root = _windows_base("LOCALAPPDATA", Path.home() / ".cache")
        return str(root / appname / "cache")


# Application name passed to the per-user directory helpers.
APP_NAME = "linkedin-apply-assistant"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class RuntimePaths:
    """Immutable bundle of every location the assistant reads from or writes to.

    Instances are plain value objects: building one does not create any of
    the directories or files it names.
    """

    # Resolved workspace root, or None when per-user directories are in use.
    workspace: Path | None
    # Directory holding configuration files.
    config_dir: Path
    # Directory holding persistent application data.
    data_dir: Path
    # Directory for cache content.
    cache_dir: Path
    # Main configuration file (default or explicit override).
    config_file: Path
    # Q&A bank file (default or explicit override).
    qa_bank_file: Path
    # Browser profile directory (default or explicit override).
    browser_profile_dir: Path
    # Root directory for generated output.
    output_dir: Path
    # Directory for reports; placed under ``output_dir`` by the resolver.
    reports_dir: Path
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _optional_path(value: str | Path | None) -> Path | None:
    """Convert *value* to a ``Path`` with ``~`` expanded, passing ``None`` through."""
    return None if value is None else Path(value).expanduser()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _resolve_under_workspace(workspace: Path, path: Path) -> Path:
    """Anchor a relative *path* at *workspace*; absolute paths are returned unchanged."""
    return path if path.is_absolute() else workspace / path
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def resolve_runtime_paths(
    workspace: str | Path | None = None,
    config: str | Path | None = None,
    qa_bank: str | Path | None = None,
    browser_profile: str | Path | None = None,
    output_dir: str | Path | None = None,
) -> RuntimePaths:
    """Work out every runtime location without creating anything.

    With a *workspace*, all defaults live under the resolved workspace root
    (``configs/``, ``data/``, ``.cache/``, ``browser-profile/``, ``output/``)
    and relative overrides are anchored at that root. Without one, the
    per-user config/data/cache directories for ``APP_NAME`` are used and
    overrides are taken as given after ``~`` expansion.

    Args:
        workspace: Optional workspace root.
        config: Optional override for the config file.
        qa_bank: Optional override for the Q&A bank file.
        browser_profile: Optional override for the browser profile directory.
        output_dir: Optional override for the output directory.

    Returns:
        A ``RuntimePaths`` whose ``reports_dir`` is ``<output_dir>/reports``.
    """

    root = _optional_path(workspace)
    if root is None:
        config_dir = Path(user_config_dir(APP_NAME))
        data_dir = Path(user_data_dir(APP_NAME))
        cache_dir = Path(user_cache_dir(APP_NAME))
        profile_default = data_dir / "browser-profile"
        output_default = data_dir / "output"
    else:
        root = root.resolve()
        config_dir = root / "configs"
        data_dir = root / "data"
        cache_dir = root / ".cache"
        profile_default = root / "browser-profile"
        output_default = root / "output"

    def choose(override: str | Path | None, default: Path) -> Path:
        # An explicit override wins; in workspace mode a relative override
        # is interpreted relative to the workspace root.
        chosen = _optional_path(override)
        if chosen is None:
            return default
        if root is None:
            return chosen
        return _resolve_under_workspace(root, chosen)

    config_file = choose(config, config_dir / "config.yml")
    qa_bank_file = choose(qa_bank, config_dir / "qa_bank.yml")
    profile_dir = choose(browser_profile, profile_default)
    output_root = choose(output_dir, output_default)

    return RuntimePaths(
        workspace=root,
        config_dir=config_dir,
        data_dir=data_dir,
        cache_dir=cache_dir,
        config_file=config_file,
        qa_bank_file=qa_bank_file,
        browser_profile_dir=profile_dir,
        output_dir=output_root,
        reports_dir=output_root / "reports",
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def ensure_runtime_dirs(
    paths: RuntimePaths,
    *,
    include_browser_profile: bool = False,
) -> RuntimePaths:
    """Create the runtime directories named by *paths* and return *paths*.

    The config, data, cache, output and reports directories are always
    created (parents included, existing directories tolerated). The browser
    profile directory is created only when *include_browser_profile* is true.
    """

    always_created = (
        paths.config_dir,
        paths.data_dir,
        paths.cache_dir,
        paths.output_dir,
        paths.reports_dir,
    )
    for target in always_created:
        target.mkdir(parents=True, exist_ok=True)

    if not include_browser_profile:
        return paths

    paths.browser_profile_dir.mkdir(parents=True, exist_ok=True)
    return paths
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""Standalone Q&A matching with explicit data paths."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from difflib import SequenceMatcher
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import re
|
|
9
|
+
from typing import Any
|
|
10
|
+
import unicodedata
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
from .paths import RuntimePaths
|
|
15
|
+
from .safety import domain_from_url, normalize_url_for_audit
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Minimum similarity score a pattern must reach to count as a match.
MATCH_THRESHOLD = 0.75

# Function words dropped by ``tokenize`` before token overlap is computed.
STOPWORDS = {
    # articles
    "a", "an", "the",
    # forms of "to be"
    "is", "are", "was", "were", "be", "been", "being",
    # prepositions
    "of", "in", "on", "at", "to", "for", "with", "by", "from", "as",
    # conjunctions
    "and", "or", "but", "if",
    # pronouns and determiners
    "your", "you", "our", "this", "that",
    # auxiliaries
    "do", "does", "did", "have", "has", "will", "would",
    # question words
    "what", "what's", "how",
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def normalize(text: str) -> str:
    """Return *text* lowercased, with accents and punctuation removed.

    Combining marks are dropped after NFKD decomposition, every character
    that is neither a word character nor whitespace becomes a space, and
    runs of whitespace collapse to a single space. Falsy input yields ``""``.
    """

    if not text:
        return ""
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    lowered = "".join(base_chars).lower()
    # Underscores count as word characters here, so they are kept.
    without_punctuation = re.sub(r"[^\w\s]", " ", lowered)
    return re.sub(r"\s+", " ", without_punctuation).strip()
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def tokenize(text: str) -> set[str]:
    """Return the distinct content words of *text* for fuzzy question matching.

    Words come from ``normalize(text)``; single-character words and entries
    of ``STOPWORDS`` are discarded.
    """

    words = normalize(text).split()
    return {word for word in words if len(word) > 1 and word not in STOPWORDS}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def similarity(question: str, pattern: str) -> float:
    """Score how well *pattern* matches *question* (higher is better).

    Three signals are combined:

    * containment: a normalized pattern of at least four characters found
      verbatim inside the normalized question scores between 0.95 and 1.0,
      growing with the share of the question it covers;
    * token overlap: the fraction of the pattern's content tokens that also
      occur in the question;
    * sequence ratio: ``difflib.SequenceMatcher`` over the normalized texts.

    Without containment the result is the larger of the sequence ratio and
    an 85/15 blend of token overlap and sequence ratio. A pattern with no
    content tokens scores 0.0.
    """

    q_norm = normalize(question)
    p_norm = normalize(pattern)

    contained = len(p_norm) >= 4 and p_norm in q_norm
    if contained:
        coverage = len(p_norm) / max(len(q_norm), 1)
        return 0.95 + coverage * 0.05

    question_tokens = tokenize(question)
    pattern_tokens = tokenize(pattern)
    if not pattern_tokens:
        return 0.0

    overlap = len(question_tokens & pattern_tokens) / len(pattern_tokens)
    sequence = SequenceMatcher(None, q_norm, p_norm).ratio()
    blended = overlap * 0.85 + sequence * 0.15
    return max(blended, sequence)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class QABank:
    """Application Q&A knowledge bank backed by explicit package paths.

    The bank is a YAML mapping with a ``qa_pairs`` list. Each pair may carry
    ``patterns`` (or ``question_patterns``), an ``answer``, an ``id`` and a
    ``field_type`` (or ``response_type``). Questions that cannot be answered
    are collected in ``session_unknowns`` and, when a pending file is
    configured, appended to that Markdown file.
    """

    def __init__(
        self,
        bank_file: str | Path | None = None,
        pending_file: str | Path | None = None,
        profile: dict[str, Any] | None = None,
    ) -> None:
        """Load the bank from *bank_file* (if any) and start an empty session log.

        Args:
            bank_file: YAML bank location; ``None`` or a missing file gives an
                empty bank.
            pending_file: Markdown file that unknown questions are appended
                to; ``None`` keeps unknowns in memory only.
            profile: Values used for placeholder substitution in answers.

        Raises:
            ValueError: If the bank root is not a mapping or ``qa_pairs`` is
                not a list.
        """
        self.bank_file = Path(bank_file).expanduser() if bank_file is not None else None
        self.pending_file = Path(pending_file).expanduser() if pending_file is not None else None
        self.profile = profile or {}
        self.data = self._load()
        self.session_unknowns: list[dict[str, Any]] = []

    @classmethod
    def from_runtime_paths(
        cls,
        paths: RuntimePaths,
        profile: dict[str, Any] | None = None,
    ) -> "QABank":
        """Create a bank using standalone runtime locations.

        The bank is read from ``paths.qa_bank_file`` and unknown questions go
        to ``pending_questions.md`` inside ``paths.data_dir``.
        """

        return cls(
            bank_file=paths.qa_bank_file,
            pending_file=paths.data_dir / "pending_questions.md",
            profile=profile,
        )

    def _load(self) -> dict[str, Any]:
        """Read and validate the bank file, returning a dict with ``qa_pairs``."""
        if self.bank_file is None or not self.bank_file.exists():
            return {"qa_pairs": []}
        parsed = yaml.safe_load(self.bank_file.read_text(encoding="utf-8"))
        if parsed is None:
            # An empty YAML document is treated as an empty bank.
            return {"qa_pairs": []}
        if not isinstance(parsed, dict):
            raise ValueError("Q&A bank root must be a mapping")
        pairs = parsed.get("qa_pairs")
        if pairs is None:
            parsed["qa_pairs"] = []
        elif not isinstance(pairs, list):
            raise ValueError("qa_pairs must be a list")
        return dict(parsed)

    def _patterns_for(self, qa: dict[str, Any]) -> list[str]:
        """Return the pair's patterns as a list of non-blank strings."""
        patterns = qa.get("patterns", qa.get("question_patterns", []))
        if isinstance(patterns, str):
            return [patterns]
        if isinstance(patterns, list):
            return [str(pattern) for pattern in patterns if str(pattern).strip()]
        return []

    def _field_type_for(self, qa: dict[str, Any]) -> str:
        """Return the pair's ``field_type``/``response_type``, defaulting to ``"text"``."""
        return str(qa.get("field_type") or qa.get("response_type") or "text")

    def _substitute_placeholders(
        self,
        text: str,
        context: dict[str, Any] | None = None,
    ) -> str:
        """Replace ``{company}``-style placeholders in *text*.

        ``{company}`` and ``{role}`` come from *context*; the remaining
        placeholders come from the profile. Missing values, and values that
        are present but ``None``, are replaced with an empty string.
        """
        if not text or "{" not in text:
            return text
        ctx = context or {}
        replacements = {
            "{company}": ctx.get("company", ""),
            "{role}": ctx.get("role", ""),
            "{portfolio}": self.profile.get("portfolio", ""),
            "{linkedin}": self.profile.get("linkedin", ""),
            "{github}": self.profile.get("github", ""),
            "{email}": self.profile.get("email", ""),
            "{phone}": self.profile.get("phone", ""),
            "{full_name}": self.profile.get("full_name", ""),
            "{first_name}": self.profile.get("first_name", ""),
            "{last_name}": self.profile.get("last_name", ""),
        }
        for placeholder, value in replacements.items():
            # A key that exists with no value must not leak the literal text
            # "None" into an answer, so None is rendered as an empty string.
            text = text.replace(placeholder, "" if value is None else str(value))
        return text

    def find_answer(
        self,
        question_text: str,
        field_type: str | None = None,
        context: dict[str, Any] | None = None,
    ) -> dict[str, Any] | None:
        """Return a matched answer record, or ``None`` when no threshold match exists.

        Every pattern of every pair is scored against *question_text*. Ties
        on score go to the pattern with the longer normalized text. When
        *field_type* is given, only pairs with a compatible type can be
        returned and their scores receive a 5% boost capped at 1.0.

        Returns:
            A dict with ``id``, ``answer`` (placeholders substituted),
            ``field_type``, ``matched_pattern`` and ``score`` (rounded to
            three decimals), or ``None``.
        """

        if not question_text:
            return None

        def type_compatible(bank_type: str, requested: str | None) -> bool:
            # Unknown on either side means "do not filter".
            if not requested or not bank_type:
                return True
            bank_type = bank_type.lower()
            requested = requested.lower()
            if bank_type == requested:
                return True
            if bank_type == "radio_or_select" and requested in {"radio", "select"}:
                return True
            textish = {"text", "textarea", "email", "tel", "url", "number"}
            return bank_type in textish and requested in textish

        best: dict[str, Any] | None = None
        best_score = 0.0
        best_specificity = -1
        best_compatible: dict[str, Any] | None = None
        best_compatible_score = 0.0
        best_compatible_specificity = -1

        for qa in self.data.get("qa_pairs", []):
            if not isinstance(qa, dict):
                continue
            bank_type = self._field_type_for(qa)
            compatible = type_compatible(bank_type, field_type)
            for pattern in self._patterns_for(qa):
                score = similarity(question_text, pattern)
                if field_type and compatible:
                    score = min(score * 1.05, 1.0)
                specificity = len(normalize(pattern))

                wins_overall = score > best_score or (
                    score == best_score and specificity > best_specificity
                )
                wins_compatible = compatible and (
                    score > best_compatible_score
                    or (
                        score == best_compatible_score
                        and specificity > best_compatible_specificity
                    )
                )
                if not (wins_overall or wins_compatible):
                    continue

                # Only build the record (and substitute placeholders) for a
                # pattern that actually becomes a leader.
                candidate = {
                    "id": qa.get("id", "?"),
                    "answer": self._substitute_placeholders(str(qa.get("answer", "")), context),
                    "field_type": bank_type,
                    "matched_pattern": pattern,
                    "score": round(score, 3),
                }
                if wins_overall:
                    best = candidate
                    best_score = score
                    best_specificity = specificity
                if wins_compatible:
                    best_compatible = candidate
                    best_compatible_score = score
                    best_compatible_specificity = specificity

        if field_type:
            if best_compatible and best_compatible_score >= MATCH_THRESHOLD:
                return best_compatible
            return None
        if best and best_score >= MATCH_THRESHOLD:
            return best
        return None

    def log_pending(
        self,
        question_text: str,
        context: dict[str, Any] | None = None,
        field_type: str | None = None,
        is_required: bool = False,
    ) -> dict[str, Any]:
        """Record an unknown question in memory and append when a pending path exists.

        Returns:
            The entry that was recorded (timestamp, question, company, role,
            ats, domain, field_type, required).
        """

        ctx = context or {}
        normalized_url = normalize_url_for_audit(ctx.get("apply_url") or ctx.get("url") or "")
        entry = {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
            "question": question_text.strip(),
            "company": ctx.get("company", "unknown"),
            "role": ctx.get("role", "unknown"),
            "ats": ctx.get("ats", "unknown"),
            "domain": ctx.get("domain") or domain_from_url(normalized_url),
            "field_type": field_type or "text",
            "required": is_required,
        }
        self.session_unknowns.append(entry)
        if self.pending_file is not None:
            self._append_pending(entry)
        return entry

    def _append_pending(self, entry: dict[str, Any]) -> None:
        """Append *entry* to the pending file, or bump its counter if already listed."""
        if self.pending_file is None:
            return
        self.pending_file.parent.mkdir(parents=True, exist_ok=True)
        existing_questions: set[str] = set()
        if self.pending_file.exists():
            content = self.pending_file.read_text(encoding="utf-8")
            for line in content.splitlines():
                match = re.match(r"^###\s+Q:\s*(.+?)\s*$", line)
                if match:
                    existing_questions.add(normalize(match.group(1)))

        if normalize(entry["question"]) in existing_questions:
            self._increment_pending_counter(entry["question"], entry)
            return

        header_needed = not self.pending_file.exists() or self.pending_file.stat().st_size == 0
        with self.pending_file.open("a", encoding="utf-8") as handle:
            if header_needed:
                handle.write(self._pending_header())
            handle.write(self._format_pending_entry(entry))

    def _pending_header(self) -> str:
        """Return the explanatory Markdown preamble written to a new pending file."""
        return """# Pending Application Questions

These are questions the assistant encountered that are not in your selected Q&A bank yet.
Add a truthful answer below each question, then copy the final entry into your own Q&A bank.

Format for adding an answer:
```
**Answer:** Your answer here
**Field type:** text | textarea | number | select | radio_or_select
**Patterns:** synonym1, synonym2
```

---

"""

    def _format_pending_entry(self, entry: dict[str, Any]) -> str:
        """Render *entry* as a Markdown section headed by ``### Q: <question>``."""
        seen_marker = f"[seen 1 time as of {entry['timestamp']}]"
        # The heading is matched line-by-line when de-duplicating, so the
        # question has to stay on a single line: collapse any internal
        # newlines or runs of whitespace to single spaces.
        heading = " ".join(str(entry["question"]).split())
        return f"""### Q: {heading}

- **First seen:** {entry["timestamp"]}
- **First context:** {entry["company"]} - {entry["role"]} ({entry["ats"]})
- **Domain:** {entry["domain"] or "unknown"}
- **Field type:** {entry["field_type"]}
- **Required:** {entry["required"]}
- **Stats:** {seen_marker}

**Answer:** _(fill in here)_

**Patterns:** _(optional - add synonyms separated by commas)_

---

"""

    def _increment_pending_counter(self, question: str, entry: dict[str, Any]) -> None:
        """Increase the ``seen N times`` counter of *question* in the pending file.

        The Stats line is searched within the 20 lines starting at the
        matching heading; the file is rewritten once it has been updated.
        """
        if self.pending_file is None or not self.pending_file.exists():
            return
        lines = self.pending_file.read_text(encoding="utf-8").splitlines(keepends=True)
        norm_target = normalize(question)
        for i, line in enumerate(lines):
            match = re.match(r"^###\s+Q:\s*(.+?)\s*$", line)
            if not match or normalize(match.group(1)) != norm_target:
                continue
            for j in range(i, min(i + 20, len(lines))):
                stat_match = re.match(
                    r"^- \*\*Stats:\*\* \[seen (\d+) times? as of [^\]]+\]\s*$",
                    lines[j],
                )
                if stat_match:
                    new_count = int(stat_match.group(1)) + 1
                    lines[j] = f"- **Stats:** [seen {new_count} times as of {entry['timestamp']}]\n"
                    self.pending_file.write_text("".join(lines), encoding="utf-8")
                    return
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Central report redaction for local JSON and Markdown artifacts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from .form_engine import normalize_space
|
|
10
|
+
from .safety import normalize_url_for_audit
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Placeholder written in place of any value that must not reach a report.
REDACTION_MARKER = "[REDACTED]"
# Maximum length of a value rendered by ``sanitize_markdown_value``.
MARKDOWN_VALUE_LIMIT = 180

# A normalized key containing any of these substrings is redacted.
_SENSITIVE_KEY_PARTS = (
    # secrets and authentication material
    "password", "secret", "token", "cookie", "credential", "auth", "session",
    # browser state and page captures
    "browser_profile", "browser-profile", "raw_html", "html", "screenshot",
    # document bodies
    "resume_contents", "cover_letter_contents", "document_contents",
    # contact answers
    "phone_answer", "email_answer", "answer_phone", "answer_email",
    # candidate data and raw captures
    "application_history", "candidate", "profile", "documents",
    "raw_form", "raw_state",
)

# Normalized keys redacted only on an exact match.
_SENSITIVE_EXACT_KEYS = {"answer", "email", "phone", "tel"}

# Keys whose string values are URLs.
_URL_KEYS = {"url", "apply_url", "search_url"}

# A string value matching any of these patterns is redacted.
_SENSITIVE_VALUE_PATTERNS = (
    re.compile(r"\bBearer\s+[A-Za-z0-9._~+/=-]+", re.IGNORECASE),  # bearer tokens
    re.compile(r"\bsessionid\s*=", re.IGNORECASE),  # session id assignments
    re.compile(r"\bcookie\s*[:=]", re.IGNORECASE),  # cookie headers or assignments
    re.compile(r"<\s*html\b", re.IGNORECASE),  # raw HTML documents
    re.compile(r"[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}"),  # email addresses
    re.compile(r"\+?\d[\d\s().-]{7,}\d"),  # phone-number-like digit runs
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _normalized_key(key: Any) -> str:
    """Return *key* as trimmed lowercase text with spaces/hyphens turned into ``_``.

    Falsy keys normalize to the empty string; each run of whitespace and
    hyphens becomes a single underscore.
    """
    lowered = str(key or "").strip().lower()
    return re.sub(r"[\s-]+", "_", lowered)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _is_sensitive_key(key: Any) -> bool:
    """Tell whether a value stored under *key* must be redacted.

    The normalized key is sensitive when it equals an entry of
    ``_SENSITIVE_EXACT_KEYS`` or contains any ``_SENSITIVE_KEY_PARTS``
    substring.
    """
    name = _normalized_key(key)
    if name in _SENSITIVE_EXACT_KEYS:
        return True
    for fragment in _SENSITIVE_KEY_PARTS:
        if fragment in name:
            return True
    return False
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _is_url_key(key: Any) -> bool:
    """Tell whether *key* names a URL field.

    True when the normalized key ends in ``_url`` or is listed in
    ``_URL_KEYS``.
    """
    name = _normalized_key(key)
    if name.endswith("_url"):
        return True
    return name in _URL_KEYS
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _is_sensitive_value(value: str) -> bool:
    """Tell whether *value* matches any pattern in ``_SENSITIVE_VALUE_PATTERNS``."""
    for pattern in _SENSITIVE_VALUE_PATTERNS:
        if pattern.search(value):
            return True
    return False
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def sanitize_report_payload(payload: Any) -> Any:
    """Build a redacted copy of *payload*, leaving the input untouched.

    The payload is sanitized as a top-level value with an empty key, so
    key-based rules only start applying to its nested entries.
    """

    sanitized = _sanitize_value(payload, key="")
    return sanitized
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _sanitize_value(value: Any, *, key: Any) -> Any:
    """Recursively sanitize *value*, which was stored under *key*.

    * A sensitive key redacts the whole value, whatever its type.
    * Strings under a URL key are passed through ``normalize_url_for_audit``;
      other strings are redacted when they look sensitive.
    * Mappings become dicts with string keys, each value sanitized under its
      own key.
    * Tuples and other non-string, non-bytes sequences become lists whose
      items are sanitized under the parent key.
    * Anything else is returned as is.
    """
    if _is_sensitive_key(key):
        return REDACTION_MARKER

    if isinstance(value, str):
        if _is_url_key(key):
            return normalize_url_for_audit(value)
        return REDACTION_MARKER if _is_sensitive_value(value) else value

    if isinstance(value, Mapping):
        cleaned: dict[str, Any] = {}
        for child_key, child_value in value.items():
            cleaned[str(child_key)] = _sanitize_value(child_value, key=child_key)
        return cleaned

    # Tuples are sequences too, so a single branch covers both.
    is_binary = isinstance(value, (bytes, bytearray))
    if isinstance(value, Sequence) and not is_binary:
        return [_sanitize_value(element, key=key) for element in value]

    return value
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def sanitize_markdown_value(value: Any) -> str:
    """Return compact Markdown-safe text for a sanitized report field.

    The value is first passed through ``sanitize_report_payload``, then
    rendered with ``str``, whitespace-normalized via ``normalize_space``,
    and ``|`` is escaped so the text can sit inside a Markdown table cell.
    Text longer than ``MARKDOWN_VALUE_LIMIT`` is cut and ends with ``...``
    so that the result is exactly ``MARKDOWN_VALUE_LIMIT`` characters.
    """

    sanitized = sanitize_report_payload(value)
    # A sanitized container can never compare equal to the redaction marker
    # string, so containers and scalars share one rendering path.
    rendered = normalize_space(str(sanitized)).replace("|", r"\|")
    if len(rendered) > MARKDOWN_VALUE_LIMIT:
        return f"{rendered[: MARKDOWN_VALUE_LIMIT - 3]}..."
    return rendered
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
__all__ = ["REDACTION_MARKER", "sanitize_markdown_value", "sanitize_report_payload"]
|