corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +10 -1
- statement_extractor/cli.py +1663 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +6972 -0
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +89 -0
- statement_extractor/models/canonical.py +182 -0
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +220 -0
- statement_extractor/models/qualifiers.py +139 -0
- statement_extractor/models/statement.py +101 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +129 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +416 -0
- statement_extractor/pipeline/registry.py +303 -0
- statement_extractor/plugins/__init__.py +55 -0
- statement_extractor/plugins/base.py +716 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +546 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +386 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +30 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +185 -0
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +197 -0
- statement_extractor/plugins/qualifiers/person.py +785 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +293 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +484 -0
- statement_extractor/plugins/taxonomy/mnli.py +291 -0
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/qualifiers/sec_edgar.py
@@ -0,0 +1,209 @@
+"""
+SECEdgarQualifierPlugin - Qualifies US ORG entities with SEC data.
+
+DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
+embedding database with pre-loaded SEC Edgar data for faster, offline matching.
+
+Uses the SEC EDGAR API to:
+- Look up CIK (Central Index Key) by company name
+- Retrieve ticker symbol, exchange, filing history
+"""
+
+import logging
+import warnings
+from typing import Optional
+
+from ..base import BaseQualifierPlugin, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...models import ExtractedEntity, EntityQualifiers, EntityType
+
+logger = logging.getLogger(__name__)
+
+# SEC EDGAR API endpoints
+SEC_COMPANY_SEARCH = "https://efts.sec.gov/LATEST/search-index"
+SEC_COMPANY_TICKERS = "https://www.sec.gov/files/company_tickers.json"
+
+
+# DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
+class SECEdgarQualifierPlugin(BaseQualifierPlugin):
+    """
+    DEPRECATED: Use EmbeddingCompanyQualifier instead.
+
+    Qualifier plugin for US ORG entities using SEC EDGAR.
+    Provides CIK and ticker symbol for publicly traded US companies.
+    """
+
+    def __init__(
+        self,
+        timeout: int = 10,
+        cache_results: bool = True,
+    ):
+        """
+        Initialize the SEC EDGAR qualifier.
+
+        DEPRECATED: Use EmbeddingCompanyQualifier instead.
+
+        Args:
+            timeout: API request timeout in seconds
+            cache_results: Whether to cache API results
+        """
+        warnings.warn(
+            "SECEdgarQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        self._timeout = timeout
+        self._cache_results = cache_results
+        self._cache: dict[str, Optional[dict]] = {}
+        self._ticker_cache: Optional[dict] = None
+
+    @property
+    def name(self) -> str:
+        return "sec_edgar_qualifier"
+
+    @property
+    def priority(self) -> int:
+        return 30  # Run after GLEIF and Companies House
+
+    @property
+    def capabilities(self) -> PluginCapability:
+        return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
+
+    @property
+    def description(self) -> str:
+        return "Looks up SEC CIK and ticker for US public companies"
+
+    @property
+    def supported_entity_types(self) -> set[EntityType]:
+        return {EntityType.ORG}
+
+    @property
+    def supported_identifier_types(self) -> list[str]:
+        return ["sec_cik", "ticker"]  # Can lookup by CIK or ticker
+
+    @property
+    def provided_identifier_types(self) -> list[str]:
+        return ["sec_cik", "ticker"]  # Provides CIK and ticker
+
+    def qualify(
+        self,
+        entity: ExtractedEntity,
+        context: PipelineContext,
+    ) -> Optional[EntityQualifiers]:
+        """
+        Qualify an ORG entity with SEC EDGAR data.
+
+        Args:
+            entity: The ORG entity to qualify
+            context: Pipeline context
+
+        Returns:
+            EntityQualifiers with CIK and ticker, or None if not found
+        """
+        if entity.type != EntityType.ORG:
+            return None
+
+        # Check cache first
+        cache_key = entity.text.lower().strip()
+        if self._cache_results and cache_key in self._cache:
+            cached = self._cache[cache_key]
+            if cached is None:
+                return None
+            return self._data_to_qualifiers(cached)
+
+        # Search SEC
+        result = self._search_sec(entity.text)
+
+        # Cache result
+        if self._cache_results:
+            self._cache[cache_key] = result
+
+        if result:
+            return self._data_to_qualifiers(result)
+
+        return None
+
+    def _load_ticker_cache(self) -> dict:
+        """Load the SEC company tickers JSON (cached)."""
+        if self._ticker_cache is not None:
+            return self._ticker_cache
+
+        try:
+            import requests
+
+            response = requests.get(SEC_COMPANY_TICKERS, timeout=self._timeout)
+            response.raise_for_status()
+            data = response.json()
+
+            # Build lookup by company name (lowercase)
+            self._ticker_cache = {}
+            for key, company in data.items():
+                name = company.get("title", "").lower()
+                if name:
+                    self._ticker_cache[name] = {
+                        "cik": str(company.get("cik_str", "")),
+                        "ticker": company.get("ticker", ""),
+                        "title": company.get("title", ""),
+                    }
+
+            logger.debug(f"Loaded {len(self._ticker_cache)} SEC company tickers")
+            return self._ticker_cache
+
+        except Exception as e:
+            logger.debug(f"Failed to load SEC ticker cache: {e}")
+            self._ticker_cache = {}
+            return self._ticker_cache
+
+    def _search_sec(self, org_name: str) -> Optional[dict]:
+        """Search SEC for company information."""
+        try:
+            # Load ticker cache
+            ticker_cache = self._load_ticker_cache()
+
+            # Try exact match first
+            org_lower = org_name.lower().strip()
+            if org_lower in ticker_cache:
+                return ticker_cache[org_lower]
+
+            # Try partial match
+            for name, data in ticker_cache.items():
+                if org_lower in name or name in org_lower:
+                    return data
+
+            # Try matching without common suffixes
+            clean_name = org_lower
+            for suffix in [" inc", " inc.", " corp", " corp.", " co", " co.", " ltd", " llc"]:
+                clean_name = clean_name.replace(suffix, "")
+            clean_name = clean_name.strip()
+
+            for name, data in ticker_cache.items():
+                clean_cached = name
+                for suffix in [" inc", " inc.", " corp", " corp.", " co", " co.", " ltd", " llc"]:
+                    clean_cached = clean_cached.replace(suffix, "")
+                clean_cached = clean_cached.strip()
+
+                if clean_name == clean_cached or clean_name in clean_cached or clean_cached in clean_name:
+                    return data
+
+        except Exception as e:
+            logger.debug(f"SEC search error: {e}")
+
+        return None
+
+    def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
+        """Convert SEC data to EntityQualifiers."""
+        identifiers = {}
+        if data.get("cik"):
+            identifiers["sec_cik"] = data["cik"]
+        if data.get("ticker"):
+            identifiers["ticker"] = data["ticker"]
+
+        return EntityQualifiers(
+            jurisdiction="US",
+            country="US",
+            identifiers=identifiers,
+        )
+
+
+# Allow importing without decorator for testing
+SECEdgarQualifierPluginClass = SECEdgarQualifierPlugin
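
For orientation, here is a minimal usage sketch (not part of the package diff) showing how the deprecated qualifier above could be exercised directly; the ExtractedEntity keyword arguments and the no-argument PipelineContext are assumptions inferred from this diff, not confirmed API.

import warnings

from statement_extractor.models import ExtractedEntity, EntityType
from statement_extractor.pipeline.context import PipelineContext
from statement_extractor.plugins.qualifiers.sec_edgar import SECEdgarQualifierPlugin

# Instantiation emits a DeprecationWarning; silence it for this sketch.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)
    qualifier = SECEdgarQualifierPlugin(timeout=10, cache_results=True)

entity = ExtractedEntity(text="Apple Inc", type=EntityType.ORG)  # assumed constructor kwargs
qualifiers = qualifier.qualify(entity, PipelineContext())        # assumed no-arg context
if qualifiers is not None:
    print(qualifiers.identifiers.get("sec_cik"), qualifiers.identifiers.get("ticker"))

Because the class is not auto-registered, importing it directly like this is the only way to run it; the pipeline itself now relies on EmbeddingCompanyQualifier.
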
statement_extractor/plugins/scrapers/http.py
@@ -0,0 +1,236 @@
+"""
+HTTP scraper plugin for fetching web content.
+
+Uses httpx for async HTTP requests with retries, timeouts, and CAPTCHA detection.
+"""
+
+import logging
+from typing import Optional
+
+from ..base import BaseScraperPlugin, ContentType, ScraperResult
+from ...pipeline.registry import PluginRegistry
+
+logger = logging.getLogger(__name__)
+
+
+@PluginRegistry.scraper
+class HttpScraperPlugin(BaseScraperPlugin):
+    """
+    Default HTTP scraper using httpx with retries and timeouts.
+
+    Features:
+    - Async HTTP requests with httpx
+    - Automatic redirect following
+    - Content type detection from headers and URL
+    - CAPTCHA page detection
+    - Configurable timeout and retries
+    """
+
+    def __init__(
+        self,
+        timeout: float = 30.0,
+        max_retries: int = 3,
+        user_agent: str = "Mozilla/5.0 (compatible; StatementExtractor/1.0; +https://github.com/corp-o-rate/statement-extractor)",
+        follow_redirects: bool = True,
+    ):
+        self._timeout = timeout
+        self._max_retries = max_retries
+        self._user_agent = user_agent
+        self._follow_redirects = follow_redirects
+
+    @property
+    def name(self) -> str:
+        return "http_scraper"
+
+    @property
+    def priority(self) -> int:
+        return 100  # Default scraper
+
+    @property
+    def description(self) -> str:
+        return "Default HTTP scraper using httpx with retries and CAPTCHA detection"
+
+    async def fetch(self, url: str, timeout: Optional[float] = None) -> ScraperResult:
+        """
+        Fetch content from a URL with retries and CAPTCHA detection.
+
+        Args:
+            url: The URL to fetch
+            timeout: Request timeout in seconds (uses instance default if None)
+
+        Returns:
+            ScraperResult with content, content type, and any errors
+        """
+        import httpx
+
+        timeout = timeout or self._timeout
+        last_error: Optional[str] = None
+
+        for attempt in range(self._max_retries):
+            try:
+                async with httpx.AsyncClient(
+                    timeout=timeout,
+                    follow_redirects=self._follow_redirects,
+                ) as client:
+                    logger.debug(f"Fetching URL: {url} (attempt {attempt + 1})")
+
+                    response = await client.get(
+                        url,
+                        headers={"User-Agent": self._user_agent},
+                    )
+
+                    content_type = self._detect_content_type(
+                        dict(response.headers), url
+                    )
+
+                    # Check for CAPTCHA if HTML
+                    error = None
+                    if content_type == ContentType.HTML:
+                        if self._is_captcha_page(response.content):
+                            error = "CAPTCHA or challenge page detected"
+                            logger.warning(f"CAPTCHA detected at {url}")
+
+                    return ScraperResult(
+                        url=url,
+                        final_url=str(response.url),
+                        content=response.content,
+                        content_type=content_type,
+                        headers=dict(response.headers),
+                        error=error,
+                    )
+
+            except httpx.TimeoutException as e:
+                last_error = f"Request timed out after {timeout}s"
+                logger.warning(f"Timeout fetching {url}: {e}")
+            except httpx.ConnectError as e:
+                last_error = f"Connection error: {e}"
+                logger.warning(f"Connection error fetching {url}: {e}")
+            except httpx.HTTPStatusError as e:
+                last_error = f"HTTP {e.response.status_code}: {e.response.reason_phrase}"
+                logger.warning(f"HTTP error fetching {url}: {e}")
+                # Don't retry on 4xx errors
+                if 400 <= e.response.status_code < 500:
+                    break
+            except Exception as e:
+                last_error = f"Unexpected error: {e}"
+                logger.exception(f"Error fetching {url}")
+
+        # All retries failed
+        return ScraperResult(
+            url=url,
+            final_url=url,
+            content=b"",
+            content_type=ContentType.UNKNOWN,
+            error=last_error or "Unknown error",
+        )
+
+    async def head(self, url: str, timeout: Optional[float] = None) -> ScraperResult:
+        """
+        Check content type without downloading the full body.
+
+        Args:
+            url: The URL to check
+            timeout: Request timeout in seconds
+
+        Returns:
+            ScraperResult with content_type populated (content is empty)
+        """
+        import httpx

+        timeout = timeout or self._timeout
+
+        try:
+            async with httpx.AsyncClient(
+                timeout=timeout,
+                follow_redirects=self._follow_redirects,
+            ) as client:
+                response = await client.head(
+                    url,
+                    headers={"User-Agent": self._user_agent},
+                )
+
+                content_type = self._detect_content_type(
+                    dict(response.headers), url
+                )
+
+                return ScraperResult(
+                    url=url,
+                    final_url=str(response.url),
+                    content=b"",
+                    content_type=content_type,
+                    headers=dict(response.headers),
+                )
+
+        except Exception as e:
+            logger.warning(f"HEAD request failed for {url}: {e}")
+            # Fall back to full fetch
+            return await self.fetch(url, timeout)
+
+    @staticmethod
+    def _detect_content_type(headers: dict[str, str], url: str) -> ContentType:
+        """
+        Detect content type from HTTP headers and URL.
+
+        Priority:
+        1. Content-Type header
+        2. URL file extension
+        """
+        content_type_header = headers.get("content-type", "").lower()
+
+        # Check Content-Type header
+        if "application/pdf" in content_type_header:
+            return ContentType.PDF
+        if any(mime in content_type_header for mime in [
+            "text/html",
+            "application/xhtml+xml",
+        ]):
+            return ContentType.HTML
+
+        # Check URL extension
+        url_lower = url.lower().split("?")[0]  # Remove query params
+        if url_lower.endswith(".pdf"):
+            return ContentType.PDF
+        if url_lower.endswith((".html", ".htm")):
+            return ContentType.HTML
+
+        # Default based on content-type
+        if content_type_header.startswith("text/"):
+            return ContentType.HTML
+        if content_type_header.startswith(("image/", "audio/", "video/")):
+            return ContentType.BINARY
+
+        return ContentType.UNKNOWN
+
+    @staticmethod
+    def _is_captcha_page(content: bytes) -> bool:
+        """
+        Detect CAPTCHA or challenge pages.
+
+        Checks for common CAPTCHA patterns in HTML content.
+        """
+        try:
+            html = content.decode("utf-8", errors="replace").lower()
+        except Exception:
+            return False
+
+        # Only check small pages (challenge pages are usually small)
+        if len(html) > 50000:
+            return False
+
+        # Common CAPTCHA/challenge indicators
+        captcha_patterns = [
+            "captcha",
+            "cloudflare",
+            "checking your browser",
+            "please verify you are a human",
+            "access denied",
+            "bot protection",
+            "ddos protection",
+            "just a moment",
+            "enable javascript",
+            "please enable cookies",
+            "verify you are human",
+            "security check",
+        ]
+
+        return any(pattern in html for pattern in captcha_patterns)