contextractor 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextractor-0.3.7.dist-info/METADATA +260 -0
- contextractor-0.3.7.dist-info/RECORD +14 -0
- contextractor-0.3.7.dist-info/WHEEL +4 -0
- contextractor-0.3.7.dist-info/entry_points.txt +2 -0
- contextractor_cli/__init__.py +1 -0
- contextractor_cli/__main__.py +5 -0
- contextractor_cli/config.py +159 -0
- contextractor_cli/crawler.py +323 -0
- contextractor_cli/main.py +333 -0
- contextractor_engine/__init__.py +22 -0
- contextractor_engine/extractor.py +70 -0
- contextractor_engine/models.py +126 -0
- contextractor_engine/py.typed +0 -0
- contextractor_engine/utils.py +34 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contextractor
|
|
3
|
+
Version: 0.3.7
|
|
4
|
+
Summary: Extract clean, readable content from any website
|
|
5
|
+
Project-URL: Homepage, https://contextractor.com
|
|
6
|
+
Project-URL: Repository, https://github.com/contextractor/contextractor
|
|
7
|
+
Project-URL: Issues, https://github.com/contextractor/contextractor/issues
|
|
8
|
+
Author-email: Miroslav Sekera <miroslav@glueo.com>
|
|
9
|
+
License-Expression: Apache-2.0
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Requires-Dist: browserforge<1.2.4
|
|
12
|
+
Requires-Dist: crawlee[playwright]>=0.4.0
|
|
13
|
+
Requires-Dist: pyyaml>=6.0
|
|
14
|
+
Requires-Dist: trafilatura>=2.0.0
|
|
15
|
+
Requires-Dist: typer>=0.15.0
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# Contextractor
|
|
19
|
+
|
|
20
|
+
Extract clean, readable content from any website using [Trafilatura](https://trafilatura.readthedocs.io/).
|
|
21
|
+
|
|
22
|
+
Available as: [pip](#install) | [npm](#install) | [Docker](#docker) | [Apify actor](https://apify.com/glueo/contextractor)
|
|
23
|
+
|
|
24
|
+
Try the [Playground](https://contextractor.com) to configure extraction settings and preview commands before running.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install contextractor
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
or
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
npm install -g contextractor
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Requires Python 3.12+ (pip) or Node.js 18+ (npm). Playwright Chromium is installed automatically.
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
contextractor https://example.com
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Works with zero config. Pass URLs directly, or use a config file for complex setups:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
contextractor https://example.com --precision --save-json -o ./results
|
|
50
|
+
contextractor --config config.json --max-pages 10
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### CLI Options
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
contextractor [OPTIONS] [URLS...]
|
|
57
|
+
|
|
58
|
+
Crawl Settings:
|
|
59
|
+
--config, -c Path to JSON config file
|
|
60
|
+
--output-dir, -o Output directory
|
|
61
|
+
--max-pages Max pages to crawl (0 = unlimited)
|
|
62
|
+
--crawl-depth Max link depth from start URLs (0 = start only)
|
|
63
|
+
--headless/--no-headless Browser headless mode (default: headless)
|
|
64
|
+
--max-concurrency Max parallel requests (default: 50)
|
|
65
|
+
--max-retries Max request retries (default: 3)
|
|
66
|
+
--max-results Max results per crawl (0 = unlimited)
|
|
67
|
+
|
|
68
|
+
Proxy:
|
|
69
|
+
--proxy-urls Comma-separated proxy URLs (http://user:pass@host:port)
|
|
70
|
+
--proxy-rotation Rotation: recommended, per_request, until_failure
|
|
71
|
+
|
|
72
|
+
Browser:
|
|
73
|
+
--launcher Browser engine: chromium, firefox (default: chromium)
|
|
74
|
+
--wait-until Page load event: load, networkidle, domcontentloaded (default: load)
|
|
75
|
+
--page-load-timeout Timeout in seconds (default: 60)
|
|
76
|
+
--ignore-cors Disable CORS/CSP restrictions
|
|
77
|
+
--close-cookie-modals Auto-dismiss cookie banners
|
|
78
|
+
--max-scroll-height Max scroll height in pixels (default: 5000)
|
|
79
|
+
--ignore-ssl-errors Skip SSL certificate verification
|
|
80
|
+
--user-agent Custom User-Agent string
|
|
81
|
+
|
|
82
|
+
Crawl Filtering:
|
|
83
|
+
--globs Comma-separated glob patterns to include
|
|
84
|
+
--excludes Comma-separated glob patterns to exclude
|
|
85
|
+
--link-selector CSS selector for links to follow
|
|
86
|
+
--keep-url-fragments Preserve URL fragments
|
|
87
|
+
--respect-robots-txt Honor robots.txt
|
|
88
|
+
|
|
89
|
+
Cookies & Headers:
|
|
90
|
+
--cookies JSON array of cookie objects
|
|
91
|
+
--headers JSON object of custom HTTP headers
|
|
92
|
+
|
|
93
|
+
Output Toggles:
|
|
94
|
+
--save-markdown/--no-save-markdown Save extracted markdown (default: true)
|
|
95
|
+
--save-raw-html Save raw HTML to output
|
|
96
|
+
--save-text Save extracted text
|
|
97
|
+
--save-json Save extracted JSON
|
|
98
|
+
--save-jsonl Save all pages as JSONL (single file)
|
|
99
|
+
--save-xml Save extracted XML
|
|
100
|
+
--save-xml-tei Save extracted XML-TEI
|
|
101
|
+
|
|
102
|
+
Content Extraction:
|
|
103
|
+
--precision High precision mode (less noise)
|
|
104
|
+
--recall High recall mode (more content)
|
|
105
|
+
--fast Fast extraction mode (less thorough)
|
|
106
|
+
--no-links Exclude links from output
|
|
107
|
+
--no-comments Exclude comments from output
|
|
108
|
+
--include-tables/--no-tables Include tables (default: include)
|
|
109
|
+
--include-images Include image descriptions
|
|
110
|
+
--include-formatting/--no-formatting Preserve formatting (default: preserve)
|
|
111
|
+
--deduplicate Deduplicate extracted content
|
|
112
|
+
--target-language Filter by language (e.g. "en")
|
|
113
|
+
--with-metadata/--no-metadata Extract metadata (default: with)
|
|
114
|
+
--prune-xpath XPath patterns to remove from content
|
|
115
|
+
|
|
116
|
+
Diagnostics:
|
|
117
|
+
--verbose, -v Enable verbose logging
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
CLI flags override config file settings. Merge order: `defaults → config file → CLI args`
|
|
121
|
+
|
|
122
|
+
### Config File (optional)
|
|
123
|
+
|
|
124
|
+
Use a JSON config file to set options:
|
|
125
|
+
|
|
126
|
+
```json
|
|
127
|
+
{
|
|
128
|
+
"urls": ["https://example.com", "https://docs.example.com"],
|
|
129
|
+
"saveMarkdown": true,
|
|
130
|
+
"outputDir": "./output",
|
|
131
|
+
"crawlDepth": 1,
|
|
132
|
+
"proxy": {
|
|
133
|
+
"urls": ["http://user:pass@host:port"],
|
|
134
|
+
"rotation": "recommended"
|
|
135
|
+
},
|
|
136
|
+
"extraction": {
|
|
137
|
+
"favorPrecision": true,
|
|
138
|
+
"includeLinks": true,
|
|
139
|
+
"includeTables": true,
|
|
140
|
+
"deduplicate": true
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Crawl Settings
|
|
146
|
+
|
|
147
|
+
| Field | Type | Default | Description |
|
|
148
|
+
|-------|------|---------|-------------|
|
|
149
|
+
| `urls` | array | `[]` | URLs to extract content from |
|
|
150
|
+
| `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
|
|
151
|
+
| `outputDir` | string | `"./output"` | Directory for extracted content |
|
|
152
|
+
| `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
|
|
153
|
+
| `headless` | bool | true | Browser headless mode |
|
|
154
|
+
| `maxConcurrency` | int | 50 | Max parallel browser pages |
|
|
155
|
+
| `maxRetries` | int | 3 | Max retries for failed requests |
|
|
156
|
+
| `maxResults` | int | 0 | Max results per crawl (0 = unlimited) |
|
|
157
|
+
|
|
158
|
+
### Proxy Configuration
|
|
159
|
+
|
|
160
|
+
| Field | Type | Default | Description |
|
|
161
|
+
|-------|------|---------|-------------|
|
|
162
|
+
| `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
|
|
163
|
+
| `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
|
|
164
|
+
| `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
|
|
165
|
+
|
|
166
|
+
### Browser Settings
|
|
167
|
+
|
|
168
|
+
| Field | Type | Default | Description |
|
|
169
|
+
|-------|------|---------|-------------|
|
|
170
|
+
| `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
|
|
171
|
+
| `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
|
|
172
|
+
| `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
|
|
173
|
+
| `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
|
|
174
|
+
| `closeCookieModals` | bool | true | Auto-dismiss cookie consent banners |
|
|
175
|
+
| `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
|
|
176
|
+
| `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
|
|
177
|
+
| `userAgent` | string | `""` | Custom User-Agent string |
|
|
178
|
+
|
|
179
|
+
### Crawl Filtering
|
|
180
|
+
|
|
181
|
+
| Field | Type | Default | Description |
|
|
182
|
+
|-------|------|---------|-------------|
|
|
183
|
+
| `globs` | array | `[]` | Glob patterns for URLs to include |
|
|
184
|
+
| `excludes` | array | `[]` | Glob patterns for URLs to exclude |
|
|
185
|
+
| `linkSelector` | string | `""` | CSS selector for links to follow |
|
|
186
|
+
| `keepUrlFragments` | bool | false | Treat URLs with different fragments as different pages |
|
|
187
|
+
| `respectRobotsTxt` | bool | false | Honor robots.txt |
|
|
188
|
+
|
|
189
|
+
### Cookies & Headers
|
|
190
|
+
|
|
191
|
+
| Field | Type | Default | Description |
|
|
192
|
+
|-------|------|---------|-------------|
|
|
193
|
+
| `cookies` | array | `[]` | Initial cookies (`[{"name": "...", "value": "...", "domain": "..."}]`) |
|
|
194
|
+
| `headers` | object | `{}` | Custom HTTP headers (`{"Authorization": "Bearer token"}`) |
|
|
195
|
+
|
|
196
|
+
### Output Toggles
|
|
197
|
+
|
|
198
|
+
Each toggle saves its format independently. Multiple can be enabled at once:
|
|
199
|
+
|
|
200
|
+
| Field | Type | Default | Description |
|
|
201
|
+
|-------|------|---------|-------------|
|
|
202
|
+
| `saveMarkdown` | bool | true | Save extracted markdown |
|
|
203
|
+
| `saveRawHtml` | bool | false | Save raw HTML |
|
|
204
|
+
| `saveText` | bool | false | Save extracted plain text |
|
|
205
|
+
| `saveJson` | bool | false | Save extracted JSON |
|
|
206
|
+
| `saveJsonl` | bool | false | Save all pages as JSONL (single file) |
|
|
207
|
+
| `saveXml` | bool | false | Save extracted XML |
|
|
208
|
+
| `saveXmlTei` | bool | false | Save extracted XML-TEI |
|
|
209
|
+
|
|
210
|
+
### Content Extraction
|
|
211
|
+
|
|
212
|
+
All options go under the `extraction` key in config files, or use the equivalent CLI flags:
|
|
213
|
+
|
|
214
|
+
| Field | Type | Default | Description |
|
|
215
|
+
|-------|------|---------|-------------|
|
|
216
|
+
| `favorPrecision` | bool | false | High precision, less noise |
|
|
217
|
+
| `favorRecall` | bool | false | High recall, more content |
|
|
218
|
+
| `includeComments` | bool | true | Include comments |
|
|
219
|
+
| `includeTables` | bool | true | Include tables |
|
|
220
|
+
| `includeImages` | bool | false | Include image descriptions |
|
|
221
|
+
| `includeFormatting` | bool | true | Preserve formatting |
|
|
222
|
+
| `includeLinks` | bool | true | Include links |
|
|
223
|
+
| `deduplicate` | bool | false | Deduplicate content |
|
|
224
|
+
| `withMetadata` | bool | true | Extract metadata (title, author, date) |
|
|
225
|
+
| `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
|
|
226
|
+
| `fast` | bool | false | Fast mode (less thorough) |
|
|
227
|
+
| `pruneXpath` | array | null | XPath patterns to remove from content |
|
|
228
|
+
|
|
229
|
+
## Docker
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
docker run ghcr.io/contextractor/contextractor https://example.com
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Save output to your local machine:
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
docker run -v ./output:/output ghcr.io/contextractor/contextractor https://example.com -o /output
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Use a config file:
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
docker run -v ./config.json:/config.json ghcr.io/contextractor/contextractor --config /config.json
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
All CLI flags work the same inside Docker.
|
|
248
|
+
|
|
249
|
+
## Output
|
|
250
|
+
|
|
251
|
+
One file per crawled page, named from the URL slug (e.g. `example-com-page.md`). Metadata (title, author, date) is included in the output header when available.
|
|
252
|
+
|
|
253
|
+
## Platforms
|
|
254
|
+
|
|
255
|
+
- npm: macOS arm64, Linux (x64, arm64), Windows x64
|
|
256
|
+
- Docker: linux/amd64, linux/arm64
|
|
257
|
+
|
|
258
|
+
## License
|
|
259
|
+
|
|
260
|
+
Apache-2.0
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
contextractor_cli/__init__.py,sha256=jO_XdncP1L1FKAbLCKnkAgwYh_ou9LhYFwBZL9YBZuY,66
|
|
2
|
+
contextractor_cli/__main__.py,sha256=hRU0ca8FU89pZuU0ZuyIYp6DrgKD_toAC0TM-F9GGrg,81
|
|
3
|
+
contextractor_cli/config.py,sha256=ksdOFvabIHt7Me2Ov-A9S7VqVj-Bx0apLyHTYDxoXcc,6372
|
|
4
|
+
contextractor_cli/crawler.py,sha256=ofXzue7SYC8_88u5TakxLzwmPMkPVNcFjd25UkmJc6c,12456
|
|
5
|
+
contextractor_cli/main.py,sha256=z11ZV5Gpt8Z1BCECYXoAJouGXCcP57su51Ra3bUslJI,11471
|
|
6
|
+
contextractor_engine/__init__.py,sha256=el3zmnxW5l6b5bj79hGZ95gC4CFjM0eLNv7b60RNw3I,596
|
|
7
|
+
contextractor_engine/extractor.py,sha256=_gRPl5FomRJvZquFDXuyLLLznhNFbZQH2zBVUtVaivQ,2478
|
|
8
|
+
contextractor_engine/models.py,sha256=XX14iZt19nxQLfGRDcXf0me6KKbTjQMw-RX5QjOUXFo,4212
|
|
9
|
+
contextractor_engine/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
contextractor_engine/utils.py,sha256=ma8NWrKficfQejWnnSDEA6WrEWqqEYolbtaurl7t1MA,1149
|
|
11
|
+
contextractor-0.3.7.dist-info/METADATA,sha256=NTGMXnYT7a-4l8FsH9n3oiZhnxCRh3IaCOm8j4CEd98,9481
|
|
12
|
+
contextractor-0.3.7.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
13
|
+
contextractor-0.3.7.dist-info/entry_points.txt,sha256=jpnlbk9gdLVAReuc9--8SumbPbpgIe10DrGvlx3d2Co,61
|
|
14
|
+
contextractor-0.3.7.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Contextractor CLI - Standalone web content extraction tool."""
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Configuration loading from JSON files (YAML also supported)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass, field, fields as dataclass_fields
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
from contextractor_engine import TrafilaturaConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Names of TrafilaturaConfig dataclass fields; merge() uses this set to
# decide whether an override key belongs to the extraction sub-config.
_EXTRACTION_FIELDS = {field_.name for field_ in dataclass_fields(TrafilaturaConfig)}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class CrawlConfig:
    """Configuration for a crawl run.

    Field defaults mirror the documented CLI defaults; `from_dict` maps the
    camelCase JSON/YAML config keys (including Apify-style aliases) onto
    these snake_case fields.
    """

    # Core
    urls: list[str] = field(default_factory=list)
    max_pages: int = 0
    output_dir: str = "./output"
    crawl_depth: int = 0
    headless: bool = True
    extraction: TrafilaturaConfig = field(default_factory=TrafilaturaConfig.balanced)

    # Proxy
    proxy_urls: list[str] = field(default_factory=list)
    proxy_rotation: str = "recommended"
    proxy_tiered: list[list[str | None]] = field(default_factory=list)

    # Browser
    launcher: str = "chromium"
    wait_until: str = "load"
    page_load_timeout: int = 60
    ignore_cors: bool = False
    close_cookie_modals: bool = True
    max_scroll_height: int = 5000
    ignore_ssl_errors: bool = False
    user_agent: str = ""

    # Crawl filtering
    globs: list[str] = field(default_factory=list)
    excludes: list[str] = field(default_factory=list)
    link_selector: str = ""
    keep_url_fragments: bool = False
    respect_robots_txt: bool = False

    # Cookies & headers
    cookies: list[dict[str, Any]] = field(default_factory=list)
    headers: dict[str, str] = field(default_factory=dict)

    # Concurrency & retries
    max_concurrency: int = 50
    max_retries: int = 3
    max_results: int = 0

    # Output toggles
    save_markdown: bool = True
    save_raw_html: bool = False
    save_text: bool = False
    save_json: bool = False
    save_jsonl: bool = False
    save_xml: bool = False
    save_xml_tei: bool = False

    @classmethod
    def from_file(cls, path: Path) -> CrawlConfig:
        """Load config from a JSON (or YAML) file.

        The file suffix picks the parser; an unrecognized suffix is parsed
        as YAML first (JSON is a YAML subset) with JSON as the fallback.
        """
        raw = path.read_text(encoding="utf-8")
        suffix = path.suffix
        if suffix == ".json":
            parsed = json.loads(raw)
        elif suffix in (".yaml", ".yml"):
            parsed = yaml.safe_load(raw) or {}
        else:
            try:
                parsed = yaml.safe_load(raw) or {}
            except yaml.YAMLError:
                parsed = json.loads(raw)
        return cls.from_dict(parsed)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> CrawlConfig:
        """Create config from a dictionary of camelCase keys.

        Several keys accept an Apify-style alias (e.g. ``maxRequestRetries``
        for ``maxRetries``); the alias takes precedence when both appear.
        """
        # Proxy settings live under a nested "proxy" object when present.
        proxy = data.get("proxy", {})
        if not isinstance(proxy, dict):
            proxy = {}

        def pick(*keys: str, default: Any = None) -> Any:
            # First present key wins; the last key carries the default.
            for alias in keys[:-1]:
                if alias in data:
                    return data[alias]
            return data.get(keys[-1], default)

        return cls(
            urls=data.get("urls", []),
            max_pages=data.get("maxPages", 0),
            output_dir=data.get("outputDir", "./output"),
            crawl_depth=data.get("crawlDepth", 0),
            headless=data.get("headless", True),
            extraction=TrafilaturaConfig.from_json_dict(data.get("extraction")),
            # Proxy
            proxy_urls=proxy.get("urls", []),
            proxy_rotation=proxy.get("rotation", "recommended"),
            proxy_tiered=proxy.get("tiered", []),
            # Browser
            launcher=data.get("launcher", "chromium").lower(),
            wait_until=data.get("waitUntil", "load").lower(),
            page_load_timeout=pick("pageLoadTimeoutSecs", "pageLoadTimeout", default=60),
            ignore_cors=pick("ignoreCorsAndCsp", "ignoreCors", default=False),
            close_cookie_modals=data.get("closeCookieModals", True),
            max_scroll_height=pick("maxScrollHeightPixels", "maxScrollHeight", default=5000),
            ignore_ssl_errors=data.get("ignoreSslErrors", False),
            user_agent=data.get("userAgent", ""),
            # Crawl filtering
            globs=data.get("globs", []),
            excludes=data.get("excludes", []),
            link_selector=data.get("linkSelector", ""),
            keep_url_fragments=data.get("keepUrlFragments", False),
            respect_robots_txt=pick("respectRobotsTxtFile", "respectRobotsTxt", default=False),
            # Cookies & headers
            cookies=pick("initialCookies", "cookies", default=[]),
            headers=pick("customHttpHeaders", "headers", default={}),
            # Concurrency & retries
            max_concurrency=data.get("maxConcurrency", 50),
            max_retries=pick("maxRequestRetries", "maxRetries", default=3),
            max_results=pick("maxResultsPerCrawl", "maxResults", default=0),
            # Output toggles
            save_markdown=pick("saveMarkdown", "saveExtractedMarkdownToKeyValueStore", default=True),
            save_raw_html=pick("saveRawHtml", "saveRawHtmlToKeyValueStore", default=False),
            save_text=pick("saveText", "saveExtractedTextToKeyValueStore", default=False),
            save_json=pick("saveJson", "saveExtractedJsonToKeyValueStore", default=False),
            save_jsonl=data.get("saveJsonl", False),
            save_xml=pick("saveXml", "saveExtractedXmlToKeyValueStore", default=False),
            save_xml_tei=pick("saveXmlTei", "saveExtractedXmlTeiToKeyValueStore", default=False),
        )

    def merge(self, overrides: dict[str, Any]) -> None:
        """Merge non-None overrides into this config.

        Keys matching TrafilaturaConfig fields are routed to self.extraction.
        Keys matching CrawlConfig fields are set directly.
        Unknown keys are ignored.
        """
        own_fields = {f.name for f in dataclass_fields(self)}
        own_fields.discard("extraction")

        for key, value in overrides.items():
            if value is None:
                continue  # None means "not supplied on the CLI"
            if key in own_fields:
                setattr(self, key, value)
            elif key in _EXTRACTION_FIELDS:
                setattr(self.extraction, key, value)
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
"""Crawling and content extraction using crawlee."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
from datetime import timedelta
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from crawlee import Request
|
|
15
|
+
from crawlee._autoscaling.autoscaled_pool import ConcurrencySettings
|
|
16
|
+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
|
|
17
|
+
from crawlee.crawlers._playwright._types import GotoOptions
|
|
18
|
+
from crawlee.proxy_configuration import ProxyConfiguration
|
|
19
|
+
|
|
20
|
+
from contextractor_engine import ContentExtractor
|
|
21
|
+
|
|
22
|
+
from .config import CrawlConfig
|
|
23
|
+
|
|
24
|
+
# Shared logger for all crawler output.
logger = logging.getLogger("contextractor")

# Maps trafilatura output-format names to the file extension used on disk.
FORMAT_EXTENSIONS = {
    fmt: ext
    for fmt, ext in (
        ("txt", ".txt"),
        ("markdown", ".md"),
        ("json", ".json"),
        ("xml", ".xml"),
        ("xmltei", ".tei.xml"),
    )
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _url_to_filename(url: str) -> str:
|
|
36
|
+
"""Convert a URL to a safe filename slug."""
|
|
37
|
+
# Remove protocol
|
|
38
|
+
slug = re.sub(r"^https?://", "", url)
|
|
39
|
+
# Replace non-alphanumeric chars with hyphens
|
|
40
|
+
slug = re.sub(r"[^a-zA-Z0-9]+", "-", slug)
|
|
41
|
+
# Remove leading/trailing hyphens
|
|
42
|
+
slug = slug.strip("-")
|
|
43
|
+
# Truncate and add hash for uniqueness
|
|
44
|
+
if len(slug) > 100:
|
|
45
|
+
url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
|
|
46
|
+
slug = f"{slug[:100]}-{url_hash}"
|
|
47
|
+
return slug
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _build_browser_launch_options(config: CrawlConfig) -> dict[str, Any]:
|
|
51
|
+
"""Build browser launch options from config."""
|
|
52
|
+
options: dict[str, Any] = {}
|
|
53
|
+
args = []
|
|
54
|
+
|
|
55
|
+
# Anti-detection: prevent navigator.webdriver=true (Chromium only)
|
|
56
|
+
if config.launcher == "chromium":
|
|
57
|
+
args.append("--disable-blink-features=AutomationControlled")
|
|
58
|
+
|
|
59
|
+
# Disable Chromium sandbox in Docker (set CONTEXTRACTOR_NO_SANDBOX=1)
|
|
60
|
+
if os.environ.get("CONTEXTRACTOR_NO_SANDBOX"):
|
|
61
|
+
args.append("--no-sandbox")
|
|
62
|
+
|
|
63
|
+
if args:
|
|
64
|
+
options["args"] = args
|
|
65
|
+
|
|
66
|
+
if config.ignore_ssl_errors:
|
|
67
|
+
options["ignore_https_errors"] = True
|
|
68
|
+
|
|
69
|
+
return options
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _build_browser_context_options(config: CrawlConfig) -> dict[str, Any] | None:
|
|
73
|
+
"""Build browser context options from config."""
|
|
74
|
+
options: dict[str, Any] = {}
|
|
75
|
+
|
|
76
|
+
if config.ignore_cors:
|
|
77
|
+
options["bypass_csp"] = True
|
|
78
|
+
|
|
79
|
+
if config.cookies:
|
|
80
|
+
options["storage_state"] = {"cookies": config.cookies}
|
|
81
|
+
|
|
82
|
+
if config.headers:
|
|
83
|
+
options["extra_http_headers"] = config.headers
|
|
84
|
+
|
|
85
|
+
if config.user_agent:
|
|
86
|
+
options["user_agent"] = config.user_agent
|
|
87
|
+
|
|
88
|
+
return options if options else None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def run_crawl(config: CrawlConfig) -> None:
|
|
92
|
+
"""Run the crawl with the given configuration."""
|
|
93
|
+
output_dir = Path(config.output_dir)
|
|
94
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
95
|
+
|
|
96
|
+
extractor = ContentExtractor(config=config.extraction)
|
|
97
|
+
pages_extracted = 0
|
|
98
|
+
max_results = config.max_results
|
|
99
|
+
|
|
100
|
+
# Configure proxy (tiered takes precedence over flat proxy_urls)
|
|
101
|
+
proxy_cfg = None
|
|
102
|
+
if config.proxy_tiered:
|
|
103
|
+
proxy_cfg = ProxyConfiguration(tiered_proxy_urls=config.proxy_tiered)
|
|
104
|
+
logger.info(f"Using tiered proxy with {len(config.proxy_tiered)} tier(s)")
|
|
105
|
+
elif config.proxy_urls:
|
|
106
|
+
proxy_cfg = ProxyConfiguration(proxy_urls=config.proxy_urls)
|
|
107
|
+
logger.info(f"Using {len(config.proxy_urls)} proxy URL(s), rotation: {config.proxy_rotation}")
|
|
108
|
+
if config.proxy_rotation == "until_failure":
|
|
109
|
+
logger.warning(
|
|
110
|
+
"proxy_rotation 'until_failure' uses round-robin rotation; "
|
|
111
|
+
"full sticky-session behavior requires Crawlee SessionPool integration"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Build browser options
|
|
115
|
+
browser_launch_options = _build_browser_launch_options(config)
|
|
116
|
+
browser_context_options = _build_browser_context_options(config)
|
|
117
|
+
|
|
118
|
+
# Build crawler kwargs
|
|
119
|
+
crawler_kwargs: dict[str, Any] = {
|
|
120
|
+
"headless": config.headless,
|
|
121
|
+
"browser_type": config.launcher,
|
|
122
|
+
"browser_launch_options": browser_launch_options,
|
|
123
|
+
"max_requests_per_crawl": config.max_pages if config.max_pages > 0 else None,
|
|
124
|
+
"max_request_retries": config.max_retries,
|
|
125
|
+
"request_handler_timeout": timedelta(seconds=config.page_load_timeout),
|
|
126
|
+
"concurrency_settings": ConcurrencySettings(
|
|
127
|
+
max_concurrency=config.max_concurrency,
|
|
128
|
+
desired_concurrency=min(10, config.max_concurrency),
|
|
129
|
+
),
|
|
130
|
+
"respect_robots_txt_file": config.respect_robots_txt,
|
|
131
|
+
"max_crawl_depth": config.crawl_depth if config.crawl_depth > 0 else None,
|
|
132
|
+
"goto_options": GotoOptions(wait_until=config.wait_until),
|
|
133
|
+
}
|
|
134
|
+
if proxy_cfg:
|
|
135
|
+
crawler_kwargs["proxy_configuration"] = proxy_cfg
|
|
136
|
+
if browser_context_options:
|
|
137
|
+
crawler_kwargs["browser_new_context_options"] = browser_context_options
|
|
138
|
+
|
|
139
|
+
crawler = PlaywrightCrawler(**crawler_kwargs)
|
|
140
|
+
|
|
141
|
+
@crawler.router.default_handler
|
|
142
|
+
async def handler(context: PlaywrightCrawlingContext) -> None:
|
|
143
|
+
nonlocal pages_extracted
|
|
144
|
+
url = context.request.url
|
|
145
|
+
logger.info(f"Processing {url}")
|
|
146
|
+
|
|
147
|
+
# Check max results limit
|
|
148
|
+
if max_results > 0 and pages_extracted >= max_results:
|
|
149
|
+
logger.info(f"Reached max results limit ({max_results}), skipping {url}")
|
|
150
|
+
return
|
|
151
|
+
|
|
152
|
+
# Auto-dismiss cookie modals (CMP-aware)
|
|
153
|
+
if config.close_cookie_modals:
|
|
154
|
+
try:
|
|
155
|
+
await context.page.evaluate("""
|
|
156
|
+
() => {
|
|
157
|
+
// 1. Didomi CMP
|
|
158
|
+
if (window.Didomi) {
|
|
159
|
+
try { window.Didomi.setUserAgreeToAll(); return; } catch {}
|
|
160
|
+
}
|
|
161
|
+
// 2. OneTrust
|
|
162
|
+
const onetrust = document.querySelector('#onetrust-accept-btn-handler');
|
|
163
|
+
if (onetrust) { onetrust.click(); return; }
|
|
164
|
+
// 3. CookieBot
|
|
165
|
+
const cookiebot = document.querySelector('#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll');
|
|
166
|
+
if (cookiebot) { cookiebot.click(); return; }
|
|
167
|
+
// 4. Quantcast / TCF
|
|
168
|
+
const quantcast = document.querySelector('.qc-cmp2-summary-buttons button[mode="primary"]');
|
|
169
|
+
if (quantcast) { quantcast.click(); return; }
|
|
170
|
+
// 5. Generic fallback selectors
|
|
171
|
+
const selectors = [
|
|
172
|
+
'[class*="cookie"] button', '[id*="cookie"] button',
|
|
173
|
+
'[class*="consent"] button', '[id*="consent"] button',
|
|
174
|
+
'button[class*="accept"]', 'button[id*="accept"]',
|
|
175
|
+
];
|
|
176
|
+
for (const sel of selectors) {
|
|
177
|
+
const btn = document.querySelector(sel);
|
|
178
|
+
if (btn) { btn.click(); return; }
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
""")
|
|
182
|
+
await context.page.wait_for_timeout(1000)
|
|
183
|
+
except Exception:
|
|
184
|
+
pass # Best effort
|
|
185
|
+
|
|
186
|
+
# Scroll page to load dynamic content
|
|
187
|
+
if config.max_scroll_height > 0:
|
|
188
|
+
try:
|
|
189
|
+
await context.page.evaluate(f"""
|
|
190
|
+
async () => {{
|
|
191
|
+
let scrolled = 0;
|
|
192
|
+
const maxScroll = {config.max_scroll_height};
|
|
193
|
+
while (scrolled < maxScroll) {{
|
|
194
|
+
window.scrollBy(0, 500);
|
|
195
|
+
scrolled += 500;
|
|
196
|
+
await new Promise(r => setTimeout(r, 100));
|
|
197
|
+
}}
|
|
198
|
+
window.scrollTo(0, 0);
|
|
199
|
+
}}
|
|
200
|
+
""")
|
|
201
|
+
except Exception:
|
|
202
|
+
pass # Best effort
|
|
203
|
+
|
|
204
|
+
html = await context.page.content()
|
|
205
|
+
slug = _url_to_filename(url)
|
|
206
|
+
|
|
207
|
+
# Extract metadata for text-based format headers
|
|
208
|
+
metadata = extractor.extract_metadata(html, url=url)
|
|
209
|
+
|
|
210
|
+
def _build_text_content(raw_content: str, fmt: str) -> str:
    """Prepend a metadata header (Title/Author/Date/URL) for markdown/txt formats.

    Closes over ``metadata`` and ``url`` from the enclosing request handler.
    Formats other than "markdown"/"txt", or pages with no title/author/date,
    get the raw content back unchanged.
    """
    lines: list[str] = []
    has_header_fields = metadata.title or metadata.author or metadata.date
    if has_header_fields and fmt in ("markdown", "txt"):
        for label, value in (
            ("Title", metadata.title),
            ("Author", metadata.author),
            ("Date", metadata.date),
        ):
            if value:
                lines.append(f"{label}: {value}")
        # URL is always part of the header block when any metadata is present.
        lines.append(f"URL: {url}")
        lines.extend(["", "---", ""])
    lines.append(raw_content)
    return "\n".join(lines)
|
|
226
|
+
|
|
227
|
+
# Track whether any content was extracted
|
|
228
|
+
any_saved = False
|
|
229
|
+
|
|
230
|
+
# Save each enabled format
|
|
231
|
+
if config.save_markdown:
|
|
232
|
+
result = extractor.extract(html, url=url, output_format="markdown")
|
|
233
|
+
if result:
|
|
234
|
+
content = _build_text_content(result.content, "markdown")
|
|
235
|
+
filepath = output_dir / f"{slug}.md"
|
|
236
|
+
filepath.write_text(content, encoding="utf-8")
|
|
237
|
+
logger.info(f"Saved {filepath}")
|
|
238
|
+
any_saved = True
|
|
239
|
+
|
|
240
|
+
if config.save_text:
|
|
241
|
+
result = extractor.extract(html, url=url, output_format="txt")
|
|
242
|
+
if result:
|
|
243
|
+
content = _build_text_content(result.content, "txt")
|
|
244
|
+
filepath = output_dir / f"{slug}.txt"
|
|
245
|
+
filepath.write_text(content, encoding="utf-8")
|
|
246
|
+
logger.info(f"Saved {filepath}")
|
|
247
|
+
any_saved = True
|
|
248
|
+
|
|
249
|
+
if config.save_json:
|
|
250
|
+
result = extractor.extract(html, url=url, output_format="json")
|
|
251
|
+
if result:
|
|
252
|
+
filepath = output_dir / f"{slug}.json"
|
|
253
|
+
filepath.write_text(result.content, encoding="utf-8")
|
|
254
|
+
logger.info(f"Saved {filepath}")
|
|
255
|
+
any_saved = True
|
|
256
|
+
|
|
257
|
+
if config.save_jsonl:
|
|
258
|
+
result = extractor.extract(html, url=url, output_format="markdown")
|
|
259
|
+
if result:
|
|
260
|
+
jsonl_path = output_dir / "output.jsonl"
|
|
261
|
+
entry = {
|
|
262
|
+
"url": url,
|
|
263
|
+
"title": metadata.title or "",
|
|
264
|
+
"author": metadata.author or "",
|
|
265
|
+
"date": metadata.date or "",
|
|
266
|
+
"content": result.content,
|
|
267
|
+
}
|
|
268
|
+
with open(jsonl_path, "a", encoding="utf-8") as f:
|
|
269
|
+
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
|
270
|
+
logger.info(f"Appended to {jsonl_path}")
|
|
271
|
+
any_saved = True
|
|
272
|
+
|
|
273
|
+
if config.save_xml:
|
|
274
|
+
result = extractor.extract(html, url=url, output_format="xml")
|
|
275
|
+
if result:
|
|
276
|
+
filepath = output_dir / f"{slug}.xml"
|
|
277
|
+
filepath.write_text(result.content, encoding="utf-8")
|
|
278
|
+
logger.info(f"Saved {filepath}")
|
|
279
|
+
any_saved = True
|
|
280
|
+
|
|
281
|
+
if config.save_xml_tei:
|
|
282
|
+
result = extractor.extract(html, url=url, output_format="xmltei")
|
|
283
|
+
if result:
|
|
284
|
+
filepath = output_dir / f"{slug}.tei.xml"
|
|
285
|
+
filepath.write_text(result.content, encoding="utf-8")
|
|
286
|
+
logger.info(f"Saved {filepath}")
|
|
287
|
+
any_saved = True
|
|
288
|
+
|
|
289
|
+
if config.save_raw_html:
|
|
290
|
+
filepath = output_dir / f"{slug}.html"
|
|
291
|
+
filepath.write_text(html, encoding="utf-8")
|
|
292
|
+
logger.info(f"Saved {filepath}")
|
|
293
|
+
any_saved = True
|
|
294
|
+
|
|
295
|
+
if not any_saved:
|
|
296
|
+
logger.warning(f"No content extracted from {url}")
|
|
297
|
+
return
|
|
298
|
+
|
|
299
|
+
pages_extracted += 1
|
|
300
|
+
|
|
301
|
+
# Enqueue links if crawl depth is configured (Crawlee handles depth limiting natively)
|
|
302
|
+
if config.crawl_depth > 0:
|
|
303
|
+
enqueue_kwargs: dict[str, Any] = {}
|
|
304
|
+
if config.link_selector:
|
|
305
|
+
enqueue_kwargs["selector"] = config.link_selector
|
|
306
|
+
if config.globs:
|
|
307
|
+
enqueue_kwargs["globs"] = config.globs
|
|
308
|
+
if config.excludes:
|
|
309
|
+
enqueue_kwargs["exclude_globs"] = config.excludes
|
|
310
|
+
|
|
311
|
+
await context.enqueue_links(**enqueue_kwargs)
|
|
312
|
+
|
|
313
|
+
# Build requests
|
|
314
|
+
requests = [
|
|
315
|
+
Request.from_url(
|
|
316
|
+
url,
|
|
317
|
+
keep_url_fragment=config.keep_url_fragments,
|
|
318
|
+
)
|
|
319
|
+
for url in config.urls
|
|
320
|
+
]
|
|
321
|
+
|
|
322
|
+
await crawler.run(requests)
|
|
323
|
+
logger.info(f"Done. Extracted {pages_extracted} pages to {output_dir}")
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
"""CLI entry point using Typer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Annotated, Optional
|
|
9
|
+
|
|
10
|
+
import typer
|
|
11
|
+
|
|
12
|
+
from .config import CrawlConfig
|
|
13
|
+
from .crawler import run_crawl
|
|
14
|
+
|
|
15
|
+
# Top-level Typer application; the `extract` command below registers onto it.
app = typer.Typer(
    name="contextractor",
    help="Extract web content from URLs using configurable extraction options.",
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.command()
def extract(
    urls: Annotated[
        Optional[list[str]],
        typer.Argument(help="URLs to extract content from"),
    ] = None,
    # -- Config file --
    config: Annotated[
        Optional[Path],
        typer.Option("--config", "-c", help="Path to JSON config file",
                     exists=True, readable=True),
    ] = None,
    # -- CrawlConfig fields --
    max_pages: Annotated[
        Optional[int],
        typer.Option("--max-pages", help="Max pages to crawl (0 = unlimited)"),
    ] = None,
    crawl_depth: Annotated[
        Optional[int],
        typer.Option("--crawl-depth", help="Max link depth from start URLs (0 = start only)"),
    ] = None,
    headless: Annotated[
        Optional[bool],
        typer.Option("--headless/--no-headless", help="Run browser in headless mode"),
    ] = None,
    output_dir: Annotated[
        Optional[str],
        typer.Option("--output-dir", "-o", help="Output directory"),
    ] = None,
    # -- Proxy --
    proxy_urls: Annotated[
        Optional[str],
        typer.Option("--proxy-urls",
                     help="Comma-separated proxy URLs (http://user:pass@host:port)"),
    ] = None,
    proxy_rotation: Annotated[
        Optional[str],
        typer.Option("--proxy-rotation",
                     help="Proxy rotation: recommended, per_request, until_failure"),
    ] = None,
    # -- Browser settings --
    launcher: Annotated[
        Optional[str],
        typer.Option("--launcher", help="Browser engine: chromium, firefox"),
    ] = None,
    wait_until: Annotated[
        Optional[str],
        typer.Option("--wait-until",
                     help="Page load event: networkidle, load, domcontentloaded"),
    ] = None,
    page_load_timeout: Annotated[
        Optional[int],
        typer.Option("--page-load-timeout", help="Page load timeout in seconds"),
    ] = None,
    ignore_cors: Annotated[
        Optional[bool],
        typer.Option("--ignore-cors", help="Disable CORS/CSP restrictions"),
    ] = None,
    close_cookie_modals: Annotated[
        Optional[bool],
        typer.Option("--close-cookie-modals", help="Auto-dismiss cookie banners"),
    ] = None,
    max_scroll_height: Annotated[
        Optional[int],
        typer.Option("--max-scroll-height", help="Max scroll height in pixels"),
    ] = None,
    ignore_ssl_errors: Annotated[
        Optional[bool],
        typer.Option("--ignore-ssl-errors", help="Skip SSL certificate verification"),
    ] = None,
    user_agent: Annotated[
        Optional[str],
        typer.Option("--user-agent", help="Custom User-Agent string"),
    ] = None,
    # -- Crawl filtering --
    globs: Annotated[
        Optional[str],
        typer.Option("--globs", help="Comma-separated glob patterns to include"),
    ] = None,
    excludes: Annotated[
        Optional[str],
        typer.Option("--excludes", help="Comma-separated glob patterns to exclude"),
    ] = None,
    link_selector: Annotated[
        Optional[str],
        typer.Option("--link-selector", help="CSS selector for links to follow"),
    ] = None,
    keep_url_fragments: Annotated[
        Optional[bool],
        typer.Option("--keep-url-fragments", help="Preserve URL fragments"),
    ] = None,
    respect_robots_txt: Annotated[
        Optional[bool],
        typer.Option("--respect-robots-txt", help="Honor robots.txt"),
    ] = None,
    # -- Cookies & headers --
    cookies: Annotated[
        Optional[str],
        typer.Option("--cookies", help="JSON array of cookie objects"),
    ] = None,
    headers: Annotated[
        Optional[str],
        typer.Option("--headers", help="JSON object of custom HTTP headers"),
    ] = None,
    # -- Concurrency & retries --
    max_concurrency: Annotated[
        Optional[int],
        typer.Option("--max-concurrency", help="Max parallel requests"),
    ] = None,
    max_retries: Annotated[
        Optional[int],
        typer.Option("--max-retries", help="Max request retries"),
    ] = None,
    max_results: Annotated[
        Optional[int],
        typer.Option("--max-results", help="Max results per crawl (0 = unlimited)"),
    ] = None,
    # -- Output toggles --
    save_markdown: Annotated[
        Optional[bool],
        typer.Option("--save-markdown/--no-save-markdown",
                     help="Save extracted markdown (default: true)"),
    ] = None,
    save_raw_html: Annotated[
        Optional[bool],
        typer.Option("--save-raw-html", help="Save raw HTML to output"),
    ] = None,
    save_text: Annotated[
        Optional[bool],
        typer.Option("--save-text", help="Save extracted text"),
    ] = None,
    save_json: Annotated[
        Optional[bool],
        typer.Option("--save-json", help="Save extracted JSON"),
    ] = None,
    save_jsonl: Annotated[
        Optional[bool],
        typer.Option("--save-jsonl", help="Save all pages as JSONL (single file)"),
    ] = None,
    save_xml: Annotated[
        Optional[bool],
        typer.Option("--save-xml", help="Save extracted XML"),
    ] = None,
    save_xml_tei: Annotated[
        Optional[bool],
        typer.Option("--save-xml-tei", help="Save extracted XML-TEI"),
    ] = None,
    # -- TrafilaturaConfig fields --
    precision: Annotated[
        Optional[bool],
        typer.Option("--precision", help="High precision mode (less noise)"),
    ] = None,
    recall: Annotated[
        Optional[bool],
        typer.Option("--recall", help="High recall mode (more content)"),
    ] = None,
    fast: Annotated[
        Optional[bool],
        typer.Option("--fast", help="Fast extraction mode (less thorough)"),
    ] = None,
    no_links: Annotated[
        Optional[bool],
        typer.Option("--no-links", help="Exclude links from output"),
    ] = None,
    no_comments: Annotated[
        Optional[bool],
        typer.Option("--no-comments", help="Exclude comments from output"),
    ] = None,
    include_tables: Annotated[
        Optional[bool],
        typer.Option("--include-tables/--no-tables", help="Include tables in output"),
    ] = None,
    include_images: Annotated[
        Optional[bool],
        typer.Option("--include-images", help="Include image descriptions"),
    ] = None,
    include_formatting: Annotated[
        Optional[bool],
        typer.Option("--include-formatting/--no-formatting",
                     help="Preserve text formatting"),
    ] = None,
    deduplicate: Annotated[
        Optional[bool],
        typer.Option("--deduplicate", help="Deduplicate extracted content"),
    ] = None,
    target_language: Annotated[
        Optional[str],
        typer.Option("--target-language", help="Filter by language (e.g. 'en')"),
    ] = None,
    with_metadata: Annotated[
        Optional[bool],
        typer.Option("--with-metadata/--no-metadata",
                     help="Extract metadata along with content"),
    ] = None,
    prune_xpath: Annotated[
        Optional[list[str]],
        typer.Option("--prune-xpath", help="XPath patterns to remove from content"),
    ] = None,
    # -- Diagnostics --
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Enable verbose logging"),
    ] = False,
) -> None:
    """Extract content from web pages.

    Configuration precedence, lowest to highest: built-in CrawlConfig
    defaults, then a JSON config file passed via ``--config``, then explicit
    CLI options, then positional URLs (which replace any configured URLs).
    Every CLI option defaults to ``None`` so that "not passed" can be
    distinguished from an explicit value when merging over the file config.
    """
    # Local alias to avoid shadowing any module-level name; only used for
    # parsing the --cookies/--headers JSON strings.
    import json as json_mod

    # Set up logging
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    # 1. Start with defaults
    cfg = CrawlConfig()

    # 2. If config file provided, load and merge
    if config is not None:
        file_config = CrawlConfig.from_file(config)
        # Replace with file-loaded config as base
        cfg = file_config

    # 3. Merge CLI args (CLI wins over file)
    # NOTE(review): unset options stay None here; CrawlConfig.merge() is
    # presumably expected to skip None values — confirm in config.py.
    cli_overrides: dict[str, object] = {
        "max_pages": max_pages,
        "crawl_depth": crawl_depth,
        "headless": headless,
        "output_dir": output_dir,
        # Proxy
        "proxy_urls": [u.strip() for u in proxy_urls.split(",")] if proxy_urls else None,
        "proxy_rotation": proxy_rotation,
        # Browser
        "launcher": launcher.lower() if launcher else None,
        "wait_until": wait_until.lower() if wait_until else None,
        "page_load_timeout": page_load_timeout,
        "ignore_cors": ignore_cors,
        "close_cookie_modals": close_cookie_modals,
        "max_scroll_height": max_scroll_height,
        "ignore_ssl_errors": ignore_ssl_errors,
        "user_agent": user_agent,
        # Crawl filtering
        "globs": [g.strip() for g in globs.split(",")] if globs else None,
        "excludes": [e.strip() for e in excludes.split(",")] if excludes else None,
        "link_selector": link_selector,
        "keep_url_fragments": keep_url_fragments,
        "respect_robots_txt": respect_robots_txt,
        # Cookies & headers (raises json.JSONDecodeError on malformed input)
        "cookies": json_mod.loads(cookies) if cookies else None,
        "headers": json_mod.loads(headers) if headers else None,
        # Concurrency & retries
        "max_concurrency": max_concurrency,
        "max_retries": max_retries,
        "max_results": max_results,
        # Output toggles
        "save_markdown": save_markdown,
        "save_raw_html": save_raw_html,
        "save_text": save_text,
        "save_json": save_json,
        "save_jsonl": save_jsonl,
        "save_xml": save_xml,
        "save_xml_tei": save_xml_tei,
        # Extraction settings (CLI --precision/--recall map to favor_* fields)
        "fast": fast,
        "favor_precision": precision,
        "favor_recall": recall,
        "include_tables": include_tables,
        "include_images": include_images,
        "include_formatting": include_formatting,
        "deduplicate": deduplicate,
        "target_language": target_language,
        "with_metadata": with_metadata,
        "prune_xpath": prune_xpath if prune_xpath else None,
    }

    # Handle --no-links and --no-comments (invert to include_*)
    if no_links:
        cli_overrides["include_links"] = False
    if no_comments:
        cli_overrides["include_comments"] = False

    cfg.merge(cli_overrides)

    # 4. URLs from positional args extend/override config urls
    if urls:
        cfg.urls = list(urls)

    # 5. Validate
    if not cfg.urls:
        typer.echo("Error: No URLs specified. Provide URLs as arguments or via --config.", err=True)
        raise typer.Exit(1)

    # Build list of active output formats for display
    active_formats = []
    if cfg.save_markdown:
        active_formats.append("markdown")
    if cfg.save_raw_html:
        active_formats.append("html")
    if cfg.save_text:
        active_formats.append("text")
    if cfg.save_json:
        active_formats.append("json")
    if cfg.save_jsonl:
        active_formats.append("jsonl")
    if cfg.save_xml:
        active_formats.append("xml")
    if cfg.save_xml_tei:
        active_formats.append("xml-tei")
    formats_str = ", ".join(active_formats) if active_formats else "markdown"

    typer.echo(f"Extracting {len(cfg.urls)} URL(s) → {cfg.output_dir}/ ({formats_str})")
    asyncio.run(run_crawl(cfg))
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Contextractor Engine - Trafilatura extraction wrapper with configurable options."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from .extractor import ContentExtractor
|
|
6
|
+
from .models import ExtractionResult, MetadataResult, TrafilaturaConfig
|
|
7
|
+
from .utils import normalize_config_keys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_default_config() -> dict[str, Any]:
    """Get default TrafilaturaConfig as JSON dict (camelCase keys)."""
    # Thin convenience wrapper so API consumers don't need to import the class.
    return TrafilaturaConfig.get_default_json()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Public API of the engine package; keep in sync with the imports above.
__all__ = [
    "ContentExtractor",
    "TrafilaturaConfig",
    "ExtractionResult",
    "MetadataResult",
    "normalize_config_keys",
    "get_default_config",
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Content extraction wrapper using trafilatura."""
|
|
2
|
+
|
|
3
|
+
import trafilatura
|
|
4
|
+
|
|
5
|
+
from .models import ExtractionResult, MetadataResult, TrafilaturaConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ContentExtractor:
    """Trafilatura wrapper with configurable extraction."""

    # Formats produced by extract_all_formats() when none are requested.
    DEFAULT_FORMATS = ["txt", "markdown", "json", "xml"]

    def __init__(self, config: TrafilaturaConfig | None = None) -> None:
        # Fall back to balanced defaults when no config is supplied.
        if config is None:
            config = TrafilaturaConfig.balanced()
        self.config = config

    def extract(
        self,
        html: str,
        url: str | None = None,
        output_format: str = "txt",
    ) -> ExtractionResult | None:
        """Extract content in specified format; None when trafilatura finds nothing."""
        extracted = trafilatura.extract(
            html,
            url=url,
            output_format=output_format,
            **self.config.to_trafilatura_kwargs(),
        )
        if extracted is None:
            return None
        return ExtractionResult(content=extracted, output_format=output_format)

    def extract_metadata(self, html: str, url: str | None = None) -> MetadataResult:
        """Extract metadata from HTML.

        Note: bare_extraction returns a Document object with attributes,
        not a dict. Use getattr() to access fields safely.
        """
        document = trafilatura.bare_extraction(html, url=url, with_metadata=True)
        if not document:
            # All MetadataResult fields default to None.
            return MetadataResult()
        wanted = ("title", "author", "date", "description", "sitename", "language")
        return MetadataResult(
            **{name: getattr(document, name, None) for name in wanted}
        )

    def extract_all_formats(
        self,
        html: str,
        url: str | None = None,
        formats: list[str] | None = None,
    ) -> dict[str, ExtractionResult]:
        """Extract content in multiple formats at once.

        Default formats: ["txt", "markdown", "json", "xml"]
        Returns dict keyed by format name. Failed extractions are omitted.
        """
        wanted = formats or self.DEFAULT_FORMATS
        pairs = ((fmt, self.extract(html, url=url, output_format=fmt)) for fmt in wanted)
        return {fmt: res for fmt, res in pairs if res is not None}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Data models for contextractor-engine."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field, fields
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .utils import normalize_config_keys, to_camel_case
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class TrafilaturaConfig:
    """Configuration for trafilatura extraction.

    Maps all non-deprecated trafilatura.extract() parameters.
    Excluded (deprecated): no_fallback, as_dict, max_tree_size, settingsfile, config, options.
    Excluded (per-call): url, record_id, output_format.
    """

    fast: bool = False
    favor_precision: bool = False
    favor_recall: bool = False
    include_comments: bool = True
    include_tables: bool = True
    include_images: bool = False
    include_formatting: bool = True
    include_links: bool = True
    deduplicate: bool = False
    target_language: str | None = None
    with_metadata: bool = True
    only_with_metadata: bool = False
    tei_validation: bool = False
    prune_xpath: str | list[str] | None = None
    url_blacklist: set[str] | None = None
    author_blacklist: set[str] | None = None
    date_extraction_params: dict[str, Any] | None = None

    @classmethod
    def balanced(cls) -> "TrafilaturaConfig":
        """Default balanced extraction."""
        return cls()

    @classmethod
    def precision(cls) -> "TrafilaturaConfig":
        """High precision, less noise."""
        return cls(favor_precision=True)

    @classmethod
    def recall(cls) -> "TrafilaturaConfig":
        """High recall, more content."""
        return cls(favor_recall=True)

    def to_trafilatura_kwargs(self) -> dict[str, Any]:
        """Convert to trafilatura.extract() keyword arguments.

        Excludes url, record_id, output_format — those are per-call.
        Only includes optional params if they are set (not None); booleans
        are always passed (False is a meaningful value, not "unset").
        """
        return {
            f.name: getattr(self, f.name)
            for f in fields(self)
            if getattr(self, f.name) is not None
        }

    def to_json_dict(self) -> dict[str, Any]:
        """Convert config to JSON-serializable dict with camelCase keys.

        Used for API responses and GUI defaults.
        Excludes None values. Sets are converted to lists for JSON compatibility.
        """
        result: dict[str, Any] = {}
        for f in fields(self):
            value = getattr(self, f.name)
            if value is None:
                continue
            if isinstance(value, set):
                value = list(value)
            result[to_camel_case(f.name)] = value
        return result

    @classmethod
    def from_json_dict(cls, data: dict[str, Any] | None) -> "TrafilaturaConfig":
        """Create config from a camelCase (or snake_case) dict.

        This is the single canonical way to build a TrafilaturaConfig from
        external input (JSON, YAML, API). Handles key normalization, None
        filtering, and type coercion (lists → sets for blacklist fields).
        Unknown keys are ignored. Returns balanced defaults for empty/None input.
        """
        if not data:
            return cls.balanced()
        normalized = normalize_config_keys(data)
        valid_fields = {f.name for f in fields(cls)}
        # Hoisted once instead of a next(...) scan per list-valued key:
        # names of fields whose annotation mentions ``set`` (blacklists).
        set_fields = {f.name for f in fields(cls) if "set" in str(f.type)}
        kwargs: dict[str, Any] = {}
        for key, value in normalized.items():
            if key not in valid_fields or value is None:
                continue
            if isinstance(value, list) and key in set_fields:
                value = set(value)
            kwargs[key] = value
        return cls(**kwargs)

    @classmethod
    def get_default_json(cls) -> dict[str, Any]:
        """Get default config as JSON-serializable dict with camelCase keys."""
        return cls().to_json_dict()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class ExtractionResult:
    """Result from a single format extraction."""

    content: str  # extracted content, already serialized in output_format
    output_format: str  # "txt", "json", "markdown", "xml", "xmltei"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass
class MetadataResult:
    """Extracted metadata from HTML.

    All fields default to None — absent metadata, or a page trafilatura
    could not process at all, yields an all-None result.
    """

    title: str | None = None
    author: str | None = None
    date: str | None = None  # publication date as reported by trafilatura
    description: str | None = None
    sitename: str | None = None
    language: str | None = None
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Utility functions for contextractor-engine."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def to_snake_case(key: str) -> str:
    """Convert camelCase to snake_case; snake_case input is returned as-is."""
    # Any underscore means the key is already snake_case — pass through.
    if "_" in key:
        return key
    # Insert "_" before each interior ASCII uppercase, then lowercase all.
    with_separators = re.sub(r"(?<!^)(?=[A-Z])", "_", key)
    return with_separators.lower()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def to_camel_case(key: str) -> str:
    """Convert snake_case to camelCase; camelCase input is returned as-is."""
    head, _, tail = key.partition("_")
    if not tail:
        # No underscore: the key is already camelCase (or a single word).
        return head
    return head + "".join(segment.capitalize() for segment in tail.split("_"))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def normalize_config_keys(config: dict[str, Any]) -> dict[str, Any]:
    """Normalize config dictionary keys to snake_case.

    Accepts both camelCase (JSON/API convention) and snake_case (Python convention).
    Auto-detects the format and converts camelCase to snake_case.
    Keys already in snake_case are left unchanged.

    Examples:
        {"favorPrecision": True} -> {"favor_precision": True}
        {"favor_precision": True} -> {"favor_precision": True}
        {"includeLinks": True, "fast": False} -> {"include_links": True, "fast": False}
    """
    # Empty dict and None both normalize to an empty mapping.
    if not config:
        return {}
    normalized: dict[str, Any] = {}
    for key, value in config.items():
        normalized[to_snake_case(key)] = value
    return normalized
|