orga-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
orga/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .model import OrganizationProfile, Document, OrgaConfig
2
+ from .pipeline import OrgaPipeline
3
+
4
+ __version__ = "0.1.0"
orga/cli/main.py ADDED
@@ -0,0 +1,216 @@
1
+ import typer
2
+ import asyncio
3
+ import json
4
+ import yaml
5
+ from pathlib import Path
6
+ from typing import Optional, List
7
+ from pydantic import ValidationError
8
+
9
+ from orga.pipeline import OrgaPipeline
10
+ from orga.model import OrgaConfig
11
+ from orga.registry import registry
12
+
13
+ # Ensure all default strategies are registered
14
+ import orga.fetch.httpx_fetcher
15
+ import orga.discover
16
+ import orga.parse.fields.parsers
17
+ import orga.parse.fields.classifier
18
+ import orga.merge.processor
19
+
20
+ app = typer.Typer(help="ORGA - Organization Profile Extractor CLI")
21
+
22
def load_config(config_path: Optional[Path]) -> OrgaConfig:
    """
    Load an OrgaConfig from a YAML or JSON file.

    Returns a default OrgaConfig when no path is given. Exits the CLI with
    code 1 when the file is missing, unparsable, or fails model validation.
    """
    if config_path is None:
        return OrgaConfig()

    if not config_path.exists():
        typer.secho(f"Error: Config file {config_path} not found.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    try:
        raw = config_path.read_text(encoding="utf-8")
        # File extension decides the parser; anything that is not YAML is
        # treated as JSON.
        is_yaml = config_path.suffix in [".yaml", ".yml"]
        parsed = yaml.safe_load(raw) if is_yaml else json.loads(raw)
        # An empty file parses to None; validate an empty mapping instead.
        return OrgaConfig.model_validate(parsed or {})
    except ValidationError as e:
        typer.secho(f"Error: Invalid configuration in {config_path}:", fg=typer.colors.RED, err=True)
        typer.echo(str(e), err=True)
        raise typer.Exit(code=1)
    except Exception as e:
        # Covers I/O errors and parser errors (YAML/JSON syntax problems).
        typer.secho(f"Error loading config: {str(e)}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)
48
+
49
@app.command()
def parse(
    url: str = typer.Argument(..., help="The URL to parse"),
    config: Optional[Path] = typer.Option(None, "--config", "-c", help="Path to configuration file (YAML/JSON)"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Path to save the output JSON"),
    pretty: bool = typer.Option(True, help="Pretty print JSON output"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Enable debug output (internal evidence, filtered links)")
):
    """
    Parse an organization profile from a single URL.

    Fetches the URL through the configured pipeline and prints (or saves)
    the resulting profile as JSON. Unless --debug is set, internal evidence
    and debug info are stripped from the export, including the copies nested
    inside locations/phones/emails/social_links.
    """
    orga_config = load_config(config)
    pipeline = OrgaPipeline(orga_config)

    async def _run():
        typer.echo(f"Fetching and parsing: {url} ...", err=True)
        profile = await pipeline.run_from_url(url)
        return profile

    profile = asyncio.run(_run())

    indent = 2 if pretty else None

    if debug:
        json_output = profile.model_dump_json(indent=indent)
    else:
        # Pydantic v2 nested exclusion: drop the top-level debug fields plus
        # the `internal_evidence` copies carried by every contact/location
        # item ("__all__" applies the exclusion to each list element).
        exclude_set = {
            "internal_evidence": True,
            "debug_info": True,
            "locations": {"__all__": {"internal_evidence": True}},
            "phones": {"__all__": {"internal_evidence": True}},
            "emails": {"__all__": {"internal_evidence": True}},
            "social_links": {"__all__": {"internal_evidence": True}}
        }
        json_output = profile.model_dump_json(indent=indent, exclude=exclude_set)

    if output:
        output.write_text(json_output, encoding="utf-8")
        typer.echo(f"Saved output to {output}", err=True)
    else:
        typer.echo(json_output)
109
+
110
@app.command()
def parse_batch(
    input_file: Path = typer.Argument(..., help="Text file containing URLs (one per line)"),
    config: Optional[Path] = typer.Option(None, "--config", "-c", help="Path to configuration file"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Path to save the output JSONL file"),
    pretty: bool = typer.Option(False, "--pretty", help="Pretty print JSON output (best for stdout debugging)"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Enable debug output in JSONL")
):
    """
    Batch parse multiple URLs.
    Default output is JSONL (one JSON per line).
    Use --pretty for readable output on stdout.

    URLs that fail to process are reported on stderr and skipped, so the
    output may contain fewer lines than the input file.
    """
    orga_config = load_config(config)
    pipeline = OrgaPipeline(orga_config)

    if not input_file.exists():
        typer.secho(f"Error: Input file {input_file} not found.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    urls = [line.strip() for line in input_file.read_text().splitlines() if line.strip()]

    # The exclusion set is identical for every URL, so build it once up
    # front. None means "export everything" (debug mode).
    exclude_set = None
    if not debug:
        exclude_set = {
            "internal_evidence": True,
            "debug_info": True,
            "locations": {"__all__": {"internal_evidence": True}},
            "phones": {"__all__": {"internal_evidence": True}},
            "emails": {"__all__": {"internal_evidence": True}},
            "social_links": {"__all__": {"internal_evidence": True}}
        }

    # Pretty-printing only applies to stdout; a file must stay valid JSONL
    # (one compact JSON document per line).
    indent = 2 if pretty and not output else None

    async def _run_batch():
        results = []
        for i, url in enumerate(urls):
            # Log progress to stderr so stdout remains clean for piping
            typer.echo(f"[{i+1}/{len(urls)}] Processing {url}...", err=True)
            try:
                profile = await pipeline.run_from_url(url)
                results.append(profile.model_dump_json(indent=indent, exclude=exclude_set))
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                typer.secho(f"Failed to process {url}: {str(e)}", fg=typer.colors.YELLOW, err=True)
        return results

    json_lines = asyncio.run(_run_batch())

    if output:
        # File output is always JSONL; lines are already compact because
        # indent is None whenever --output is set.
        if pretty:
            typer.secho("Warning: --pretty is ignored when writing to file to maintain valid JSONL format.", fg=typer.colors.YELLOW, err=True)

        with output.open("w", encoding="utf-8") as f:
            for line in json_lines:
                f.write(line + "\n")
        typer.secho(f"Successfully processed {len(json_lines)} URLs. Output: {output}", fg=typer.colors.GREEN, err=True)
    else:
        # Stdout output
        for line in json_lines:
            typer.echo(line)
181
+
182
@app.command()
def list_strategies():
    """
    List all registered strategies.

    Prints one section per strategy kind with the registered names and
    their implementing classes.
    """
    typer.echo("Registered ORGA Strategies:")
    for kind in ("fetcher", "discoverer", "parser", "category_classifier", "merger"):
        names = registry.list(kind)
        typer.echo(f"\n[{kind.upper()}]")
        if not names:
            typer.echo(" (None)")
        for name in names:
            impl = registry.get(kind, name)
            typer.echo(f" - {name}: {impl.__name__}")
198
+
199
@app.command()
def validate_config(config_path: Path):
    """
    Validate an ORGA configuration file.
    """
    # load_config exits the CLI with code 1 on any problem, so reaching
    # the next line means the file parsed and validated successfully.
    load_config(config_path)
    typer.secho(f"Configuration is valid: {config_path}", fg=typer.colors.GREEN)
206
+
207
@app.command()
def inspect_signals(url: str):
    """
    Inspect raw signals extracted from a URL (Debug mode).
    """
    # Placeholder command: prints a notice until the feature lands.
    typer.echo(f"Inspecting signals for {url}...")
    typer.echo("This feature is planned for a future milestone (M4).", err=True)
214
+
215
# Allow running the CLI module directly (e.g. `python -m orga.cli.main`).
if __name__ == "__main__":
    app()
@@ -0,0 +1,96 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Set, Dict
3
+ from urllib.parse import urljoin, urlparse
4
+ from selectolax.parser import HTMLParser
5
+ import re
6
+ from orga.model import Document
7
+ from orga.registry import registry
8
+
9
class DiscoveryStrategy(ABC):
    """
    Abstract base for page discovery strategies.

    Implementations take an already-fetched entry document and return
    additional candidate URLs worth fetching.
    """

    @abstractmethod
    def discover(self, entry_doc: Document) -> List[str]:
        """Return a list of URLs discovered from *entry_doc*."""
17
+
18
class HeuristicDiscoveryStrategy(DiscoveryStrategy):
    """
    Heuristic-based page discovery strategy.

    Ranks same-domain links by keyword matches in the URL path and anchor
    text and returns the highest-scoring ones. Note that _score_url floors
    every score at 1, so all internal links qualify as candidates; the
    keywords only influence the ranking.
    """

    DEFAULT_KEYWORDS = {
        "contact": 10,
        "contact-us": 10,
        "about": 8,
        "about-us": 8,
        "location": 9,
        "locations": 9,
        "store": 7,
        "clinic": 7,
        "team": 5,
        "find-us": 9,
        "support": 6
    }

    def __init__(self, max_pages: int = 5, keywords: Dict[str, int] = None):
        # Fall back to the built-in table when no (or an empty) keyword
        # mapping is supplied.
        self.max_pages = max_pages
        self.keywords = keywords or self.DEFAULT_KEYWORDS

    def discover(self, entry_doc: Document) -> List[str]:
        """Return up to max_pages same-domain URLs from entry_doc, best first."""
        if not entry_doc.content:
            return []

        dom = HTMLParser(entry_doc.content)
        base_domain = urlparse(entry_doc.url).netloc

        scored = []
        # Trailing-slash-insensitive de-duplication; the entry page itself
        # is never a candidate.
        visited = {entry_doc.url.rstrip('/')}

        for anchor in dom.css("a[href]"):
            # attributes.get may yield None for a bare `<a href>` attribute.
            raw_href = (anchor.attributes.get("href") or "").strip()
            if not raw_href or raw_href.startswith(("#", "javascript:", "mailto:", "tel:")):
                continue

            absolute = urljoin(entry_doc.url, raw_href)
            dedupe_key = absolute.rstrip('/')
            if dedupe_key in visited:
                continue

            # Same-domain filter: skip off-site links entirely.
            if urlparse(absolute).netloc != base_domain:
                continue

            rank = self._score_url(absolute, anchor.text() or "")
            if rank > 0:
                scored.append((rank, absolute))
                visited.add(dedupe_key)

        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [link for _, link in scored[:self.max_pages]]

    def _score_url(self, url: str, link_text: str) -> int:
        """Score a link by keyword hits in its path and anchor text (floor 1)."""
        path = urlparse(url).path.lower()
        text = link_text.lower()

        total = sum(
            weight * ((kw in path) + (kw in text))
            for kw, weight in self.keywords.items()
        )

        # Floor of 1 so unmatched internal links still qualify as candidates.
        if total == 0:
            total = 1

        return total

# Register strategy
registry.register("discoverer", "heuristic", HeuristicDiscoveryStrategy)
orga/fetch/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from orga.fetch.httpx_fetcher import HttpxFetcher
2
+ from orga.fetch.strategy import FetchStrategy
3
+
4
+ __all__ = ["HttpxFetcher", "FetchStrategy"]
@@ -0,0 +1,119 @@
1
+ import httpx
2
+ import asyncio
3
+ from typing import Optional, List, Dict
4
+ import tenacity
5
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
6
+ from aiolimiter import AsyncLimiter
7
+ from urllib.parse import urlparse
8
+
9
+ from orga.model import Document, OrgaConfig, Warning, WarningSeverity, SourceKind
10
+ from orga.registry import registry
11
+
12
class FetchError(Exception):
    """Custom exception for fetch operations.

    NOTE(review): not raised anywhere in this module — the fetcher reports
    failures via warning-bearing Documents instead. Kept as part of the
    public surface in case external callers reference it.
    """
15
+
16
class HttpxFetcher:
    """
    Default fetch strategy using HTTPX.

    Implements retries, timeout control, concurrency limits, and rate
    limiting. fetch() never raises: failures are returned as Documents
    carrying fetch_warnings so the pipeline can continue.
    """

    # Class-level pools so fetchers built from equivalent configs share one
    # semaphore / rate limiter. Keyed by the concurrency *value*: the
    # previous id(config) key was unsafe (ids are reused after garbage
    # collection) and never let distinct-but-equal configs share, defeating
    # the stated intent.
    _semaphores: Dict[int, asyncio.Semaphore] = {}
    _limiters: Dict[int, AsyncLimiter] = {}

    def __init__(self, config: OrgaConfig):
        self.config = config.fetch

        # Initialize (or reuse) the shared concurrency controls for this
        # concurrency setting.
        pool_key = self.config.concurrency
        if pool_key not in self._semaphores:
            self._semaphores[pool_key] = asyncio.Semaphore(self.config.concurrency)
            # Fixed default rate limit: 10 requests per second.
            self._limiters[pool_key] = AsyncLimiter(10, 1)

        self._semaphore = self._semaphores[pool_key]
        self._limiter = self._limiters[pool_key]

    async def fetch(self, url: str) -> Document:
        """
        Fetch a single URL asynchronously with concurrency and rate limit control.

        Returns a Document; on failure the Document carries status_code 0
        (or the HTTP status) and a populated fetch_warnings list.
        """
        headers = {"User-Agent": self.config.user_agent}

        # Apply rate limiting and concurrency control
        async with self._limiter:
            async with self._semaphore:
                try:
                    return await self._fetch_with_retry(url, headers)
                except tenacity.RetryError as e:
                    # Retries exhausted: surface the last underlying error.
                    # NOTE(review): the warning code stays FETCH_TIMEOUT for
                    # backward compatibility even though the final error may
                    # be a network or 5xx error rather than a timeout.
                    last_exception = e.last_attempt.exception()
                    message = f"Connection timed out (Max retries reached): {str(last_exception)}"
                    return self._create_error_document(url, "FETCH_TIMEOUT", message, WarningSeverity.ERROR)
                except httpx.TimeoutException:
                    return self._create_error_document(url, "FETCH_TIMEOUT", "Connection timed out", WarningSeverity.ERROR)
                except Exception as e:
                    return self._create_error_document(url, "FETCH_ERROR", str(e), WarningSeverity.ERROR)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type((httpx.NetworkError, httpx.RemoteProtocolError, httpx.ReadTimeout, httpx.HTTPStatusError))
    )
    async def _fetch_with_retry(self, url: str, headers: Dict[str, str]) -> Document:
        """Single fetch attempt; tenacity retries transient network/5xx errors."""
        # A fresh AsyncClient per attempt keeps connection state clean at the
        # cost of pooling; the default fetcher is meant as a convenience
        # layer (Design Doc 7.2), so simplicity wins here.
        async with httpx.AsyncClient(timeout=self.config.timeout, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)

            # 5xx responses raise HTTPStatusError, which the decorator retries.
            if response.status_code >= 500:
                response.raise_for_status()

            # 404 is final: report it as a warning without retrying.
            if response.status_code == 404:
                return self._create_document_from_response(response, warnings=[
                    Warning(code="HTTP_404", message="Page not found", severity=WarningSeverity.WARNING)
                ])

            # Flag non-HTML payloads; downstream parsers expect HTML.
            content_type = response.headers.get("content-type", "").lower()
            warnings = []
            if "text/html" not in content_type:
                warnings.append(Warning(
                    code="NON_HTML_CONTENT",
                    message=f"Content-Type is {content_type}, expected text/html",
                    severity=WarningSeverity.WARNING
                ))

            return self._create_document_from_response(response, warnings=warnings)

    def _create_document_from_response(self, response: httpx.Response, warnings: List[Warning] = None) -> Document:
        """Build a Document from an HTTP response, keeping a minimal header summary."""
        return Document(
            url=str(response.url),
            content=response.text,
            content_type=response.headers.get("content-type", "application/octet-stream"),
            status_code=response.status_code,
            headers_summary={k: v for k, v in response.headers.items() if k.lower() in ["content-type", "server", "date"]},
            fetch_warnings=warnings or [],
            source_kind=SourceKind.HTTP_FETCH
        )

    def _create_error_document(self, url: str, code: str, message: str, severity: WarningSeverity) -> Document:
        """Build a placeholder Document (status_code 0) describing a failed fetch."""
        return Document(
            url=url,
            content="[FETCH FAILED]",
            status_code=0,
            fetch_warnings=[
                Warning(code=code, message=message, severity=severity, source_url=url)
            ],
            source_kind=SourceKind.HTTP_FETCH
        )

# Register strategy
registry.register("fetcher", "httpx", HttpxFetcher)
orga/fetch/strategy.py ADDED
@@ -0,0 +1,8 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List
3
+ from orga.model import Document
4
+
5
class FetchStrategy(ABC):
    """Interface for fetch strategies: resolve a URL into a Document."""

    @abstractmethod
    async def fetch(self, url: str) -> Document:
        """Fetch *url* and return the resulting Document."""
@@ -0,0 +1,152 @@
1
+ from typing import List, Dict, Any, Optional
2
+ from orga.model import (
3
+ OrganizationProfile, Evidence, Warning,
4
+ WarningSeverity, Confidence, Location, ContactKind
5
+ )
6
+
7
# Default Weights and Reliabilities for Evidence Types
# Defined in Design Doc 12.2
SOURCE_TYPE_METRICS = {
    "jsonld_address": {"weight": 1.0, "reliability": 1.0},
    "jsonld_org_name": {"weight": 1.0, "reliability": 1.0},
    "html_attr_tel": {"weight": 0.9, "reliability": 0.95},
    "html_attr_mailto": {"weight": 0.9, "reliability": 0.95},
    "html_attr_social": {"weight": 0.8, "reliability": 0.9},
    "regex_text_validated": {"weight": 0.5, "reliability": 0.7},
    "parsed_address": {"weight": 0.7, "reliability": 0.8},
    "heuristic_text": {"weight": 0.3, "reliability": 0.5},
    "text_matcher_validated": {"weight": 0.6, "reliability": 0.7},
}

class ScoringEngine:
    """
    Implements mathematical scoring formulas from Design Document Section 12.3.
    """

    def calculate_field_score(self, evidences: List[Evidence]) -> float:
        """
        Field-level score as a weighted average: sum(w_i * r_i) / sum(w_i).

        Each evidence contributes its source-type weight; reliability is the
        evidence's own confidence_score when positive, otherwise the
        source-type default. Unknown source types fall back to 0.5/0.5.
        """
        numerator = 0.0
        denominator = 0.0

        for evidence in evidences:
            metrics = SOURCE_TYPE_METRICS.get(evidence.source_type, {"weight": 0.5, "reliability": 0.5})
            weight = metrics["weight"]
            reliability = evidence.confidence_score if evidence.confidence_score > 0 else metrics["reliability"]
            numerator += weight * reliability
            denominator += weight

        # Empty evidence list (or all-zero weights) scores zero.
        if denominator <= 0:
            return 0.0
        return round(numerator / denominator, 2)

    def calculate_profile_score(self, profile: OrganizationProfile) -> float:
        """
        Profile-level score with completeness penalty:
        score_profile = (sum(alpha_f * score_f)) * beta_completeness
        """
        # alpha_f: relative importance of each field group
        # (contacts = phones + emails + socials).
        alpha = {"name": 0.3, "locations": 0.4, "contacts": 0.3}

        total = 0.0

        # 1. Name contributes a flat 1.0 when present.
        if profile.name:
            total += alpha["name"] * 1.0

        # 2. Locations: the single strongest location drives the score
        # (max, not mean — profile strength).
        if profile.locations:
            total += alpha["locations"] * max(loc.confidence for loc in profile.locations)

        # 3. Contacts: all channels pooled; strongest wins.
        pooled_contacts = profile.phones + profile.emails + profile.social_links
        if pooled_contacts:
            total += alpha["contacts"] * max(c.confidence for c in pooled_contacts)

        # beta_completeness: multiplicative penalties for missing critical
        # fields (Design Doc 12.3.2).
        beta = 1.0
        if not profile.locations:
            beta *= 0.7
        if not profile.phones and not profile.emails:
            beta *= 0.8
        if not profile.categories:
            beta *= 0.9

        return round(total * beta, 2)
88
+
89
class WarningRegistry:
    """
    Standardized Warning Codes implementation (Design Doc 12.4.1).
    """

    def scan_for_warnings(self, profile: OrganizationProfile) -> List[Warning]:
        """
        Scan a profile and emit the standardized warnings defined by the
        contract. An essentially empty profile short-circuits with a single
        EMPTY_PROFILE error.
        """
        found: List[Warning] = []

        # EMPTY_PROFILE: nothing meaningful was extracted at all.
        if not profile.name and not profile.locations and not profile.phones and not profile.emails:
            found.append(Warning(
                code="EMPTY_PROFILE",
                message="No significant profile data extracted",
                severity=WarningSeverity.ERROR
            ))
            return found

        # NO_LOCATION_FOUND / ADDRESS_PARTIALLY_PARSED are mutually exclusive.
        if not profile.locations:
            found.append(Warning(
                code="NO_LOCATION_FOUND",
                message="No physical address found in documentation",
                severity=WarningSeverity.WARNING
            ))
        elif any(
            loc.address.raw and not (loc.address.street or loc.address.city)
            for loc in profile.locations
        ):
            # Emitted at most once, regardless of how many locations are
            # only partially parsed.
            found.append(Warning(
                code="ADDRESS_PARTIALLY_PARSED",
                message="Address found but only raw string was extracted",
                severity=WarningSeverity.WARNING,
                related_field="locations"
            ))

        # NO_CONTACT_FOUND
        if not (profile.phones or profile.emails or profile.social_links):
            found.append(Warning(
                code="NO_CONTACT_FOUND",
                message="No telephone, email or social media links found",
                severity=WarningSeverity.WARNING
            ))

        # CLASSIFICATION_LOW_CONFIDENCE (aligned with Design Doc 12.4.1)
        if not profile.categories:
            found.append(Warning(
                code="CLASSIFICATION_LOW_CONFIDENCE",
                message="Business classification confidence is low or no categories found",
                severity=WarningSeverity.WARNING
            ))

        # LOW_CONFIDENCE_FIELD
        if profile.confidence and profile.confidence.overall_score < 0.4:
            found.append(Warning(
                code="LOW_CONFIDENCE_FIELD",
                message=f"Overall profile confidence is low ({profile.confidence.overall_score})",
                severity=WarningSeverity.WARNING
            ))

        return found