orga 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orga/__init__.py +4 -0
- orga/cli/main.py +216 -0
- orga/discover/__init__.py +96 -0
- orga/fetch/__init__.py +4 -0
- orga/fetch/httpx_fetcher.py +119 -0
- orga/fetch/strategy.py +8 -0
- orga/governance/__init__.py +152 -0
- orga/governance/classification_aggregator.py +128 -0
- orga/merge/__init__.py +3 -0
- orga/merge/constants.py +50 -0
- orga/merge/processor.py +333 -0
- orga/model/__init__.py +11 -0
- orga/model/config.py +66 -0
- orga/model/document.py +45 -0
- orga/model/profile.py +88 -0
- orga/model/types.py +20 -0
- orga/parse/fields/__init__.py +7 -0
- orga/parse/fields/address_scorer.py +66 -0
- orga/parse/fields/classifier.py +287 -0
- orga/parse/fields/parsers.py +355 -0
- orga/parse/taxonomy.py +114 -0
- orga/pipeline/__init__.py +222 -0
- orga/registry/__init__.py +54 -0
- orga-0.1.0.dist-info/METADATA +187 -0
- orga-0.1.0.dist-info/RECORD +28 -0
- orga-0.1.0.dist-info/WHEEL +4 -0
- orga-0.1.0.dist-info/entry_points.txt +2 -0
- orga-0.1.0.dist-info/licenses/LICENSE +21 -0
orga/__init__.py
ADDED
orga/cli/main.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
import asyncio
|
|
3
|
+
import json
|
|
4
|
+
import yaml
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional, List
|
|
7
|
+
from pydantic import ValidationError
|
|
8
|
+
|
|
9
|
+
from orga.pipeline import OrgaPipeline
|
|
10
|
+
from orga.model import OrgaConfig
|
|
11
|
+
from orga.registry import registry
|
|
12
|
+
|
|
13
|
+
# Ensure all default strategies are registered
|
|
14
|
+
import orga.fetch.httpx_fetcher
|
|
15
|
+
import orga.discover
|
|
16
|
+
import orga.parse.fields.parsers
|
|
17
|
+
import orga.parse.fields.classifier
|
|
18
|
+
import orga.merge.processor
|
|
19
|
+
|
|
20
|
+
# Root Typer application; sub-commands below attach via @app.command().
app = typer.Typer(help="ORGA - Organization Profile Extractor CLI")
|
|
21
|
+
|
|
22
|
+
def load_config(config_path: Optional[Path]) -> OrgaConfig:
    """
    Load an OrgaConfig from a YAML or JSON file.

    Returns the default configuration when no path is supplied. Exits the
    CLI with code 1 when the file is missing, fails schema validation, or
    cannot be read/parsed.
    """
    if config_path is None:
        return OrgaConfig()

    if not config_path.exists():
        typer.secho(f"Error: Config file {config_path} not found.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    try:
        raw_text = config_path.read_text(encoding="utf-8")
        # File suffix decides the parser; anything not .yaml/.yml is treated as JSON.
        is_yaml = config_path.suffix in (".yaml", ".yml")
        parsed = yaml.safe_load(raw_text) if is_yaml else json.loads(raw_text)
        # An empty file parses to None; validate an empty mapping instead.
        return OrgaConfig.model_validate(parsed or {})
    except ValidationError as e:
        typer.secho(f"Error: Invalid configuration in {config_path}:", fg=typer.colors.RED, err=True)
        typer.echo(str(e), err=True)
        raise typer.Exit(code=1)
    except Exception as e:
        typer.secho(f"Error loading config: {str(e)}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)
|
|
48
|
+
|
|
49
|
+
@app.command()
def parse(
    url: str = typer.Argument(..., help="The URL to parse"),
    config: Optional[Path] = typer.Option(None, "--config", "-c", help="Path to configuration file (YAML/JSON)"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Path to save the output JSON"),
    pretty: bool = typer.Option(True, help="Pretty print JSON output"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Enable debug output (internal evidence, filtered links)")
):
    """
    Parse an organization profile from a single URL.

    The profile is emitted as JSON on stdout (or written to --output).
    Unless --debug is given, internal evidence and debug fields are stripped
    from the serialized profile.
    """
    orga_config = load_config(config)
    pipeline = OrgaPipeline(orga_config)

    async def _run():
        # Progress goes to stderr so stdout stays clean for piping.
        typer.echo(f"Fetching and parsing: {url} ...", err=True)
        return await pipeline.run_from_url(url)

    profile = asyncio.run(_run())

    indent = 2 if pretty else None

    if debug:
        json_output = profile.model_dump_json(indent=indent)
    else:
        # Pydantic v2 nested exclusion: the {"__all__": ...} form applies the
        # exclusion to every element of the nested lists, so internal
        # evidence is stripped from contacts and locations as well.
        exclude_set = {
            "internal_evidence": True,
            "debug_info": True,
            "locations": {"__all__": {"internal_evidence": True}},
            "phones": {"__all__": {"internal_evidence": True}},
            "emails": {"__all__": {"internal_evidence": True}},
            "social_links": {"__all__": {"internal_evidence": True}},
        }
        json_output = profile.model_dump_json(indent=indent, exclude=exclude_set)

    if output:
        output.write_text(json_output, encoding="utf-8")
        typer.echo(f"Saved output to {output}", err=True)
    else:
        typer.echo(json_output)
|
|
109
|
+
|
|
110
|
+
@app.command()
def parse_batch(
    input_file: Path = typer.Argument(..., help="Text file containing URLs (one per line)"),
    config: Optional[Path] = typer.Option(None, "--config", "-c", help="Path to configuration file"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Path to save the output JSONL file"),
    pretty: bool = typer.Option(False, "--pretty", help="Pretty print JSON output (best for stdout debugging)"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Enable debug output in JSONL")
):
    """
    Batch parse multiple URLs.
    Default output is JSONL (one JSON per line).
    Use --pretty for readable output on stdout.
    """
    orga_config = load_config(config)
    pipeline = OrgaPipeline(orga_config)

    if not input_file.exists():
        typer.secho(f"Error: Input file {input_file} not found.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    urls = [line.strip() for line in input_file.read_text().splitlines() if line.strip()]

    # Exclusion set and indentation are loop-invariant: compute them once.
    # exclude=None means "serialize everything" (debug mode).
    exclude_set = None
    if not debug:
        exclude_set = {
            "internal_evidence": True,
            "debug_info": True,
            "locations": {"__all__": {"internal_evidence": True}},
            "phones": {"__all__": {"internal_evidence": True}},
            "emails": {"__all__": {"internal_evidence": True}},
            "social_links": {"__all__": {"internal_evidence": True}},
        }
    # When writing to a file we always emit compact single-line JSON so the
    # result is valid JSONL; --pretty only ever affects stdout.
    indent = 2 if pretty and not output else None

    async def _run_batch():
        results = []
        for i, url in enumerate(urls):
            # Log progress to stderr so stdout remains clean for piping.
            typer.echo(f"[{i+1}/{len(urls)}] Processing {url}...", err=True)
            try:
                profile = await pipeline.run_from_url(url)
                results.append(profile.model_dump_json(indent=indent, exclude=exclude_set))
            except Exception as e:
                # A single bad URL must not abort the whole batch.
                typer.secho(f"Failed to process {url}: {str(e)}", fg=typer.colors.YELLOW, err=True)
        return results

    json_lines = asyncio.run(_run_batch())

    if output:
        if pretty:
            typer.secho("Warning: --pretty is ignored when writing to file to maintain valid JSONL format.", fg=typer.colors.YELLOW, err=True)

        # Lines are already compact (indent is None whenever output is set),
        # so they can be written verbatim as JSONL.
        with output.open("w", encoding="utf-8") as f:
            for line in json_lines:
                f.write(line + "\n")
        typer.secho(f"Successfully processed {len(json_lines)} URLs. Output: {output}", fg=typer.colors.GREEN, err=True)
    else:
        # Stdout output
        for line in json_lines:
            typer.echo(line)
|
|
181
|
+
|
|
182
|
+
@app.command()
def list_strategies():
    """
    List all registered strategies.

    Prints every strategy kind known to the pipeline and, under each kind,
    the registered names with their implementing class.
    """
    strategy_kinds = ["fetcher", "discoverer", "parser", "category_classifier", "merger"]

    typer.echo("Registered ORGA Strategies:")
    for strategy_kind in strategy_kinds:
        registered = registry.list(strategy_kind)
        typer.echo(f"\n[{strategy_kind.upper()}]")
        if not registered:
            typer.echo(" (None)")
        for name in registered:
            impl = registry.get(strategy_kind, name)
            typer.echo(f" - {name}: {impl.__name__}")
|
|
198
|
+
|
|
199
|
+
@app.command()
def validate_config(config_path: Path):
    """
    Validate an ORGA configuration file.

    Delegates to load_config, which exits with code 1 on a missing file or
    invalid content; reaching the success message means the file is valid.
    """
    load_config(config_path)
    typer.secho(f"Configuration is valid: {config_path}", fg=typer.colors.GREEN)
|
|
206
|
+
|
|
207
|
+
@app.command()
def inspect_signals(url: str):
    """
    Inspect raw signals extracted from a URL (Debug mode).

    Currently a stub: it only announces the inspection and reports that the
    feature is scheduled for a later milestone.
    """
    typer.echo(f"Inspecting signals for {url}...")
    typer.echo("This feature is planned for a future milestone (M4).", err=True)
|
|
214
|
+
|
|
215
|
+
# Allow running this module directly (e.g. `python -m orga.cli.main`).
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Set, Dict
|
|
3
|
+
from urllib.parse import urljoin, urlparse
|
|
4
|
+
from selectolax.parser import HTMLParser
|
|
5
|
+
import re
|
|
6
|
+
from orga.model import Document
|
|
7
|
+
from orga.registry import registry
|
|
8
|
+
|
|
9
|
+
class DiscoveryStrategy(ABC):
    """
    Abstract interface for page discovery strategies.

    A discovery strategy inspects an already-fetched entry document and
    proposes further high-value URLs worth fetching.
    """

    @abstractmethod
    def discover(self, entry_doc: Document) -> List[str]:
        """Return candidate URLs discovered from *entry_doc*."""
|
|
17
|
+
|
|
18
|
+
class HeuristicDiscoveryStrategy(DiscoveryStrategy):
    """
    Keyword-driven page discovery.

    Scores same-domain links by matching a table of high-value keywords
    (contact, about, location, ...) against both the URL path and the anchor
    text, then returns up to ``max_pages`` of the best-scoring candidates.
    """

    DEFAULT_KEYWORDS = {
        "contact": 10,
        "contact-us": 10,
        "about": 8,
        "about-us": 8,
        "location": 9,
        "locations": 9,
        "store": 7,
        "clinic": 7,
        "team": 5,
        "find-us": 9,
        "support": 6
    }

    def __init__(self, max_pages: int = 5, keywords: Dict[str, int] = None):
        self.max_pages = max_pages
        # Fall back to the built-in keyword table when none is supplied.
        self.keywords = keywords or self.DEFAULT_KEYWORDS

    def discover(self, entry_doc: Document) -> List[str]:
        """Collect up to ``max_pages`` same-domain links, ranked by keyword score."""
        if not entry_doc.content:
            return []

        dom = HTMLParser(entry_doc.content)
        base_domain = urlparse(entry_doc.url).netloc

        scored_links = []
        # Track normalized (trailing-slash-stripped) URLs to avoid duplicates,
        # seeded with the entry page itself.
        visited = {entry_doc.url.rstrip('/')}

        for anchor in dom.css("a[href]"):
            # attributes.get may yield None for a bare `<a href>` attribute.
            raw_href = (anchor.attributes.get("href") or "").strip()

            if not raw_href or raw_href.startswith(("#", "javascript:", "mailto:", "tel:")):
                continue

            absolute = urljoin(entry_doc.url, raw_href)
            dedup_key = absolute.rstrip('/')

            if dedup_key in visited:
                continue

            # Only follow links that stay on the entry page's domain.
            if urlparse(absolute).netloc != base_domain:
                continue

            link_score = self._score_url(absolute, anchor.text() or "")
            if link_score > 0:
                scored_links.append((link_score, absolute))
                visited.add(dedup_key)

        scored_links.sort(key=lambda pair: pair[0], reverse=True)
        return [link for _, link in scored_links[:self.max_pages]]

    def _score_url(self, url: str, link_text: str) -> int:
        """Score a link by keyword hits in its path and anchor text (minimum 1)."""
        path = urlparse(url).path.lower()
        text = link_text.lower()

        total = 0
        for keyword, weight in self.keywords.items():
            if keyword in path:
                total += weight
            if keyword in text:
                total += weight

        # Floor of 1 keeps otherwise-unmatched internal links around as
        # low-priority candidates.
        return total if total > 0 else 1
|
|
94
|
+
|
|
95
|
+
# Register strategy: makes the heuristic discoverer selectable by name.
registry.register("discoverer", "heuristic", HeuristicDiscoveryStrategy)
|
orga/fetch/httpx_fetcher.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import httpx
|
|
2
|
+
import asyncio
|
|
3
|
+
from typing import Optional, List, Dict
|
|
4
|
+
import tenacity
|
|
5
|
+
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
|
6
|
+
from aiolimiter import AsyncLimiter
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from orga.model import Document, OrgaConfig, Warning, WarningSeverity, SourceKind
|
|
10
|
+
from orga.registry import registry
|
|
11
|
+
|
|
12
|
+
class FetchError(Exception):
    """Raised when a fetch operation fails irrecoverably."""
|
|
15
|
+
|
|
16
|
+
class HttpxFetcher:
    """
    Default fetch strategy using HTTPX.

    Implements retries, timeout control, concurrency limits, and rate limiting.
    fetch() never raises: transport failures are converted into error
    Documents (status_code=0) carrying a fetch warning.
    """

    # Class-level pools so fetcher instances built from the *same* fetch-config
    # object share one semaphore and one rate limiter.
    # NOTE(review): keyed by id(config) — sharing only happens for the identical
    # object (not for equal configs), entries are never evicted, and an id can
    # be reused after garbage collection. TODO confirm whether keying by a
    # stable config hash was intended.
    _semaphores: Dict[int, asyncio.Semaphore] = {}
    _limiters: Dict[int, AsyncLimiter] = {}

    def __init__(self, config: OrgaConfig):
        # Only the fetch section of the overall config is used here.
        self.config = config.fetch

        # Initialize global concurrency control, shared among fetchers created
        # from the same config object (see class-level NOTE above).
        config_id = id(self.config)
        if config_id not in self._semaphores:
            self._semaphores[config_id] = asyncio.Semaphore(self.config.concurrency)
            # The rate limit is currently hard-coded; the fetch config does not
            # expose a rate-limit setting here.
            self._limiters[config_id] = AsyncLimiter(10, 1)  # 10 requests per second

        self._semaphore = self._semaphores[config_id]
        self._limiter = self._limiters[config_id]

    async def fetch(self, url: str) -> Document:
        """
        Fetch a single URL asynchronously with concurrency and rate limit control.

        Returns a Document in all cases: retry exhaustion, timeouts and any
        other exception are mapped to an error Document instead of raising.
        """
        headers = {"User-Agent": self.config.user_agent}

        # Acquire a rate-limit slot first, then a concurrency slot.
        async with self._limiter:
            async with self._semaphore:
                try:
                    return await self._fetch_with_retry(url, headers)
                except tenacity.RetryError as e:
                    # All retry attempts failed; surface the last underlying error.
                    last_exception = e.last_attempt.exception()
                    message = f"Connection timed out (Max retries reached): {str(last_exception)}"
                    return self._create_error_document(url, "FETCH_TIMEOUT", message, WarningSeverity.ERROR)
                except httpx.TimeoutException:
                    # Timeout variants not covered by the retry predicate
                    # (only ReadTimeout is retried) propagate directly here.
                    return self._create_error_document(url, "FETCH_TIMEOUT", "Connection timed out", WarningSeverity.ERROR)
                except Exception as e:
                    # Catch-all so one bad URL cannot crash a batch run.
                    return self._create_error_document(url, "FETCH_ERROR", str(e), WarningSeverity.ERROR)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type((httpx.NetworkError, httpx.RemoteProtocolError, httpx.ReadTimeout, httpx.HTTPStatusError))
    )
    async def _fetch_with_retry(self, url: str, headers: Dict[str, str]) -> Document:
        """Perform one GET attempt; tenacity retries network/protocol/5xx failures up to 3 times."""
        # Note: AsyncClient is instantiated inside the retry block to ensure fresh connection pool if needed,
        # but for high performance we might want to share one client.
        # Design Doc 7.2 suggests default fetcher should be a 'convenience layer'.
        async with httpx.AsyncClient(timeout=self.config.timeout, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)

            # 5xx: raise HTTPStatusError so the @retry predicate triggers a retry.
            if response.status_code >= 500:
                response.raise_for_status()

            # 404 is a final answer: return the document with a warning, no retry.
            if response.status_code == 404:
                return self._create_document_from_response(response, warnings=[
                    Warning(code="HTTP_404", message="Page not found", severity=WarningSeverity.WARNING)
                ])

            # Flag non-HTML content; the body is still passed along untouched.
            content_type = response.headers.get("content-type", "").lower()
            warnings = []
            if "text/html" not in content_type:
                warnings.append(Warning(
                    code="NON_HTML_CONTENT",
                    message=f"Content-Type is {content_type}, expected text/html",
                    severity=WarningSeverity.WARNING
                ))

            return self._create_document_from_response(response, warnings=warnings)

    def _create_document_from_response(self, response: httpx.Response, warnings: Optional[List[Warning]] = None) -> Document:
        """Build a Document from an HTTP response (the post-redirect URL is kept)."""
        return Document(
            url=str(response.url),
            content=response.text,
            content_type=response.headers.get("content-type", "application/octet-stream"),
            status_code=response.status_code,
            # Keep only a small, stable subset of response headers.
            headers_summary={k: v for k, v in response.headers.items() if k.lower() in ["content-type", "server", "date"]},
            fetch_warnings=warnings or [],
            source_kind=SourceKind.HTTP_FETCH
        )

    def _create_error_document(self, url: str, code: str, message: str, severity: WarningSeverity) -> Document:
        """Build a placeholder Document (status_code=0) describing a failed fetch."""
        return Document(
            url=url,
            content="[FETCH FAILED]",
            status_code=0,
            fetch_warnings=[
                Warning(code=code, message=message, severity=severity, source_url=url)
            ],
            source_kind=SourceKind.HTTP_FETCH
        )
|
|
117
|
+
|
|
118
|
+
# Register strategy: makes the httpx fetcher selectable by name.
registry.register("fetcher", "httpx", HttpxFetcher)
|
orga/governance/__init__.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
from typing import List, Dict, Any, Optional
|
|
2
|
+
from orga.model import (
|
|
3
|
+
OrganizationProfile, Evidence, Warning,
|
|
4
|
+
WarningSeverity, Confidence, Location, ContactKind
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
# Default Weights and Reliabilities for Evidence Types
# Defined in Design Doc 12.2
SOURCE_TYPE_METRICS = {
    "jsonld_address": {"weight": 1.0, "reliability": 1.0},
    "jsonld_org_name": {"weight": 1.0, "reliability": 1.0},
    "html_attr_tel": {"weight": 0.9, "reliability": 0.95},
    "html_attr_mailto": {"weight": 0.9, "reliability": 0.95},
    "html_attr_social": {"weight": 0.8, "reliability": 0.9},
    "regex_text_validated": {"weight": 0.5, "reliability": 0.7},
    "parsed_address": {"weight": 0.7, "reliability": 0.8},
    "heuristic_text": {"weight": 0.3, "reliability": 0.5},
    "text_matcher_validated": {"weight": 0.6, "reliability": 0.7},
}

class ScoringEngine:
    """
    Implements mathematical scoring formulas from Design Document Section 12.3.
    """

    def calculate_field_score(self, evidences: "List[Evidence]") -> float:
        """
        Calculate a field-level score as a weighted average:

            score = sum(w_i * r_i) / sum(w_i)

        where w_i is the source-type weight and r_i is the evidence's own
        confidence score, falling back to the source-type reliability when
        the evidence reports no positive confidence.

        Returns 0.0 for an empty evidence list.
        """
        if not evidences:
            return 0.0

        weighted_sum = 0.0
        weight_sum = 0.0

        for ev in evidences:
            # Unknown source types get neutral defaults.
            metrics = SOURCE_TYPE_METRICS.get(ev.source_type, {"weight": 0.5, "reliability": 0.5})
            w = metrics["weight"]
            r = ev.confidence_score if ev.confidence_score > 0 else metrics["reliability"]

            weighted_sum += w * r
            weight_sum += w

        return round(weighted_sum / weight_sum, 2) if weight_sum > 0 else 0.0

    def calculate_profile_score(self, profile: "OrganizationProfile") -> float:
        """
        Calculate the profile-level score with a completeness penalty:

            score_profile = sum(alpha_f * score_f) * beta_completeness
        """
        # alpha_f: relative importance of each field group
        weights = {
            "name": 0.3,
            "locations": 0.4,
            "contacts": 0.3  # phones + emails + socials
        }

        sum_weighted_scores = 0.0

        # 1. Name contributes its full weight when present.
        if profile.name:
            sum_weighted_scores += weights["name"] * 1.0

        # 2. Locations: the single strongest location drives profile strength.
        if profile.locations:
            best_location_confidence = max(loc.confidence for loc in profile.locations)
            sum_weighted_scores += weights["locations"] * best_location_confidence

        # 3. Contacts: best confidence across phones, emails and socials.
        all_contacts = profile.phones + profile.emails + profile.social_links
        if all_contacts:
            best_contact_confidence = max(c.confidence for c in all_contacts)
            sum_weighted_scores += weights["contacts"] * best_contact_confidence

        # beta_completeness: penalty for missing critical fields (Design Doc 12.3.2)
        beta = 1.0
        if not profile.locations:
            beta *= 0.7
        if not profile.phones and not profile.emails:
            beta *= 0.8
        if not profile.categories:
            beta *= 0.9

        return round(sum_weighted_scores * beta, 2)
|
|
88
|
+
|
|
89
|
+
class WarningRegistry:
    """
    Standardized Warning Codes implementation (Design Doc 12.4.1).
    """

    def scan_for_warnings(self, profile: OrganizationProfile) -> List[Warning]:
        """
        Scan a profile and emit the standardized warnings defined by the contract.
        """
        found = []

        # EMPTY_PROFILE: nothing meaningful was extracted at all — short-circuit.
        if not (profile.name or profile.locations or profile.phones or profile.emails):
            found.append(Warning(
                code="EMPTY_PROFILE",
                message="No significant profile data extracted",
                severity=WarningSeverity.ERROR
            ))
            return found

        # NO_LOCATION_FOUND / ADDRESS_PARTIALLY_PARSED
        if not profile.locations:
            found.append(Warning(
                code="NO_LOCATION_FOUND",
                message="No physical address found in documentation",
                severity=WarningSeverity.WARNING
            ))
        else:
            # Flag at most once: a raw address string without parsed street/city.
            partially_parsed = any(
                loc.address.raw and not (loc.address.street or loc.address.city)
                for loc in profile.locations
            )
            if partially_parsed:
                found.append(Warning(
                    code="ADDRESS_PARTIALLY_PARSED",
                    message="Address found but only raw string was extracted",
                    severity=WarningSeverity.WARNING,
                    related_field="locations"
                ))

        # NO_CONTACT_FOUND
        if not (profile.phones or profile.emails or profile.social_links):
            found.append(Warning(
                code="NO_CONTACT_FOUND",
                message="No telephone, email or social media links found",
                severity=WarningSeverity.WARNING
            ))

        # CLASSIFICATION_LOW_CONFIDENCE (Aligned with Design Doc 12.4.1)
        if not profile.categories:
            found.append(Warning(
                code="CLASSIFICATION_LOW_CONFIDENCE",
                message="Business classification confidence is low or no categories found",
                severity=WarningSeverity.WARNING
            ))

        # LOW_CONFIDENCE_FIELD
        if profile.confidence and profile.confidence.overall_score < 0.4:
            found.append(Warning(
                code="LOW_CONFIDENCE_FIELD",
                message=f"Overall profile confidence is low ({profile.confidence.overall_score})",
                severity=WarningSeverity.WARNING
            ))

        return found
|