PyPI - matrx-scraper - Versions diffs - 0.1.0__py3-none-any.whl - Mend

matrx-scraper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

matrx_scraper/__init__.py +232 -0
matrx_scraper/_ext.py +43 -0
matrx_scraper/ai_browser/__init__.py +112 -0
matrx_scraper/ai_browser/actions.py +573 -0
matrx_scraper/ai_browser/client.py +438 -0
matrx_scraper/ai_browser/session.py +193 -0
matrx_scraper/ai_tools/__init__.py +44 -0
matrx_scraper/ai_tools/specs.py +575 -0
matrx_scraper/api/__init__.py +26 -0
matrx_scraper/api/browser_router.py +326 -0
matrx_scraper/api/ext_router.py +322 -0
matrx_scraper/api/preview_router.py +29 -0
matrx_scraper/api/scrape_router.py +224 -0
matrx_scraper/browser_pool.py +218 -0
matrx_scraper/cache.py +164 -0
matrx_scraper/crawler.py +1051 -0
matrx_scraper/custom_extractors.py +196 -0
matrx_scraper/domain_config.py +315 -0
matrx_scraper/events.py +234 -0
matrx_scraper/extractors.py +73 -0
matrx_scraper/features/__init__.py +15 -0
matrx_scraper/features/extensions.py +61 -0
matrx_scraper/features/mcp_tool_helpers.py +175 -0
matrx_scraper/features/quick_search.py +48 -0
matrx_scraper/features/read_page.py +189 -0
matrx_scraper/features/utils.py +36 -0
matrx_scraper/graph_nodes/__init__.py +44 -0
matrx_scraper/graph_nodes/scrape_actions.py +252 -0
matrx_scraper/graph_nodes/stock_image_actions.py +218 -0
matrx_scraper/gsc_bootstrap.py +262 -0
matrx_scraper/mcp/__init__.py +35 -0
matrx_scraper/mcp/__main__.py +6 -0
matrx_scraper/mcp/server.py +257 -0
matrx_scraper/orchestrator.py +383 -0
matrx_scraper/pagerank.py +104 -0
matrx_scraper/parser/__init__.py +22 -0
matrx_scraper/parser/core.py +386 -0
matrx_scraper/parser/data_types.py +631 -0
matrx_scraper/parser/element_extractor.py +845 -0
matrx_scraper/parser/extraction_rules.py +67 -0
matrx_scraper/parser/flattener.py +441 -0
matrx_scraper/parser/hashing.py +89 -0
matrx_scraper/parser/link_extractor.py +165 -0
matrx_scraper/parser/main_content.py +76 -0
matrx_scraper/parser/noise_config.py +133 -0
matrx_scraper/parser/noise_remover.py +163 -0
matrx_scraper/parser/overrides.py +188 -0
matrx_scraper/parser/scrape_filter.py +229 -0
matrx_scraper/parser/scrape_json_to_text.py +165 -0
matrx_scraper/parser/transform.py +297 -0
matrx_scraper/parser/utils.py +409 -0
matrx_scraper/performance.py +643 -0
matrx_scraper/preview.py +195 -0
matrx_scraper/queue_backend.py +114 -0
matrx_scraper/rate_limiter.py +114 -0
matrx_scraper/recipe_runtime.py +140 -0
matrx_scraper/recipes.py +141 -0
matrx_scraper/scraper.py +788 -0
matrx_scraper/search/__init__.py +18 -0
matrx_scraper/search/brave_client.py +114 -0
matrx_scraper/search/rate_limiter.py +27 -0
matrx_scraper/search/search.py +188 -0
matrx_scraper/seo_audit.py +413 -0
matrx_scraper/server/__init__.py +19 -0
matrx_scraper/server/__main__.py +57 -0
matrx_scraper/server/app.py +181 -0
matrx_scraper/server/config.py +64 -0
matrx_scraper/service.py +424 -0
matrx_scraper/url_utils.py +30 -0
matrx_scraper/utils/__init__.py +8 -0
matrx_scraper/utils/url.py +239 -0
matrx_scraper-0.1.0.dist-info/METADATA +179 -0
matrx_scraper-0.1.0.dist-info/RECORD +75 -0
matrx_scraper-0.1.0.dist-info/WHEEL +4 -0
matrx_scraper-0.1.0.dist-info/entry_points.txt +2 -0

matrx_scraper/__init__.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""matrx-scraper — Web scraping engine, HTML parsing, and search integration.
+The top-level `matrx_scraper` namespace exposes a wide surface (orchestrator,
+crawler, search, parsers, AI browser primitives, etc.). To avoid forcing
+heavy optional dependencies (Playwright, Selenium fallbacks, etc.) on
+consumers that only use a slice of the API (e.g. `matrx_scraper.search` or
+`matrx_scraper.queue_backend` from aidream), we resolve top-level names
+lazily via PEP 562 `__getattr__`.
+Submodule imports (`from matrx_scraper.search import ...`,
+`from matrx_scraper.ai_browser import ...`, etc.) bypass this lazy layer
+and load only what the caller actually needs.
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+# Lazy-export table for the package namespace: each key is a public attribute
+# name of `matrx_scraper`, each value is the dotted path of the module that
+# the module-level `__getattr__` imports on first access to that name.
+# The same keys also feed `__dir__` and `__all__`.
+# NOTE(review): the TYPE_CHECKING imports in this file bind CustomExtractor,
+# find_extractors_for_url, run_custom_extractors, run_custom_extractor and
+# PageRankEdge from differently named attributes (Extractor, find_for_url,
+# run_all, run_extractor, Edge) — confirm the target modules also expose the
+# public names used as keys here.
+_LAZY_IMPORTS: dict[str, str] = {
+    "scrape": "matrx_scraper.orchestrator",
+    "scrape_many": "matrx_scraper.orchestrator",
+    "scrape_many_stream": "matrx_scraper.orchestrator",
+    "ScrapeResult": "matrx_scraper.orchestrator",
+    "ScrapeService": "matrx_scraper.service",
+    "ScrapeOptions": "matrx_scraper.service",
+    "crawl_site": "matrx_scraper.crawler",
+    "SiteCrawler": "matrx_scraper.crawler",
+    "SiteCrawlerConfig": "matrx_scraper.crawler",
+    "CrawlEventSink": "matrx_scraper.crawler",
+    "PersistRequest": "matrx_scraper.crawler",
+    "PersistResult": "matrx_scraper.crawler",
+    "BodyPersister": "matrx_scraper.crawler",
+    "RENDER_HTTP_ONLY": "matrx_scraper.crawler",
+    "RENDER_HTTP_FIRST": "matrx_scraper.crawler",
+    "RENDER_BROWSER_ALWAYS": "matrx_scraper.crawler",
+    "RENDER_BROWSER_WITH_SCREENSHOT": "matrx_scraper.crawler",
+    "VALID_RENDER_MODES": "matrx_scraper.crawler",
+    "QueueBackend": "matrx_scraper.queue_backend",
+    "QueueItem": "matrx_scraper.queue_backend",
+    "InMemoryQueueBackend": "matrx_scraper.queue_backend",
+    "HostRateLimiter": "matrx_scraper.rate_limiter",
+    "audit_html": "matrx_scraper.seo_audit",
+    "SeoAuditResult": "matrx_scraper.seo_audit",
+    "CrawlEvent": "matrx_scraper.events",
+    "CrawlEventType": "matrx_scraper.events",
+    "CrawlStartedEvent": "matrx_scraper.events",
+    "CrawlPageDiscoveredEvent": "matrx_scraper.events",
+    "CrawlPageFetchedEvent": "matrx_scraper.events",
+    "CrawlPageParsedEvent": "matrx_scraper.events",
+    "CrawlPageFailedEvent": "matrx_scraper.events",
+    "CrawlProgressEvent": "matrx_scraper.events",
+    "CrawlIssueDetectedEvent": "matrx_scraper.events",
+    "CrawlWarningEvent": "matrx_scraper.events",
+    "CrawlCompletedEvent": "matrx_scraper.events",
+    "PageSummary": "matrx_scraper.events",
+    "parse_html": "matrx_scraper.parser",
+    "ParserOrchestrator": "matrx_scraper.parser",
+    "LinkExtractor": "matrx_scraper.parser.link_extractor",
+    "NoiseRemover": "matrx_scraper.parser.noise_remover",
+    "NoiseRemoverConfig": "matrx_scraper.parser.noise_config",
+    "MainContentFinder": "matrx_scraper.parser.main_content",
+    "compute_hashes": "matrx_scraper.parser.hashing",
+    "compute_minhash_from_text": "matrx_scraper.parser.hashing",
+    "compute_simhash": "matrx_scraper.parser.hashing",
+    "BraveSearchClient": "matrx_scraper.search",
+    "async_brave_search": "matrx_scraper.search",
+    "CacheBackend": "matrx_scraper.cache",
+    "MemoryCache": "matrx_scraper.cache",
+    "TwoTierCache": "matrx_scraper.cache",
+    "DomainConfigBackend": "matrx_scraper.domain_config",
+    "PostgresDomainConfigStore": "matrx_scraper.domain_config",
+    "StaticDomainConfigStore": "matrx_scraper.domain_config",
+    "PlaywrightBrowserPool": "matrx_scraper.browser_pool",
+    "URLInfo": "matrx_scraper.utils",
+    "get_url_info": "matrx_scraper.utils",
+    "normalize_url": "matrx_scraper.url_utils",
+    "compute_link_scores": "matrx_scraper.pagerank",
+    "PageRankEdge": "matrx_scraper.pagerank",
+    "CustomExtractor": "matrx_scraper.custom_extractors",
+    "find_extractors_for_url": "matrx_scraper.custom_extractors",
+    "run_custom_extractors": "matrx_scraper.custom_extractors",
+    "run_custom_extractor": "matrx_scraper.custom_extractors",
+    "CrawlRecipe": "matrx_scraper.recipes",
+    "RecipeAction": "matrx_scraper.recipes",
+    "RecipeBackend": "matrx_scraper.recipes",
+    "StaticRecipeBackend": "matrx_scraper.recipes",
+    "DEFAULT_RECIPES": "matrx_scraper.recipes",
+    "CapturedScreenshot": "matrx_scraper.recipe_runtime",
+    "capture_screenshots": "matrx_scraper.recipe_runtime",
+    "execute_actions": "matrx_scraper.recipe_runtime",
+    "PsiClient": "matrx_scraper.performance",
+    "PsiSnapshot": "matrx_scraper.performance",
+    "GscClient": "matrx_scraper.performance",
+    "GscPageSnapshot": "matrx_scraper.performance",
+    "GscQueryRow": "matrx_scraper.performance",
+    "quick_preview": "matrx_scraper.preview",
+    "BrowserSession": "matrx_scraper.ai_browser",
+    "BrowserSessionManager": "matrx_scraper.ai_browser",
+    "get_browser_session_manager": "matrx_scraper.ai_browser",
+    "RemoteBrowserClient": "matrx_scraper.ai_browser",
+    "BrowserClientError": "matrx_scraper.ai_browser",
+    "ToolSpec": "matrx_scraper.ai_tools",
+    "BROWSER_TOOLS": "matrx_scraper.ai_tools",
+    "SCRAPE_TOOLS": "matrx_scraper.ai_tools",
+    "CRAWL_TOOLS": "matrx_scraper.ai_tools",
+    "ALL_TOOLS": "matrx_scraper.ai_tools",
+}
+def __getattr__(name: str):
+    """PEP 562 module-level lazy attribute resolution."""
+    module_path = _LAZY_IMPORTS.get(name)
+    if module_path is None:
+        raise AttributeError(f"module 'matrx_scraper' has no attribute {name!r}")
+    import importlib
+    module = importlib.import_module(module_path)
+    value = getattr(module, name)
+    globals()[name] = value
+    return value
+def __dir__() -> list[str]:
+    return sorted(set(globals()) | set(_LAZY_IMPORTS))
+if TYPE_CHECKING:
+    from matrx_scraper.ai_browser import (
+        BrowserClientError,
+        BrowserSession,
+        BrowserSessionManager,
+        RemoteBrowserClient,
+        get_browser_session_manager,
+    )
+    from matrx_scraper.ai_tools import (
+        ALL_TOOLS,
+        BROWSER_TOOLS,
+        CRAWL_TOOLS,
+        SCRAPE_TOOLS,
+        ToolSpec,
+    )
+    from matrx_scraper.browser_pool import PlaywrightBrowserPool
+    from matrx_scraper.cache import CacheBackend, MemoryCache, TwoTierCache
+    from matrx_scraper.crawler import (
+        RENDER_BROWSER_ALWAYS,
+        RENDER_BROWSER_WITH_SCREENSHOT,
+        RENDER_HTTP_FIRST,
+        RENDER_HTTP_ONLY,
+        VALID_RENDER_MODES,
+        BodyPersister,
+        CrawlEventSink,
+        PersistRequest,
+        PersistResult,
+        SiteCrawler,
+        SiteCrawlerConfig,
+        crawl_site,
+    )
+    from matrx_scraper.custom_extractors import (
+        Extractor as CustomExtractor,
+        find_for_url as find_extractors_for_url,
+        run_all as run_custom_extractors,
+        run_extractor as run_custom_extractor,
+    )
+    from matrx_scraper.domain_config import (
+        DomainConfigBackend,
+        PostgresDomainConfigStore,
+        StaticDomainConfigStore,
+    )
+    from matrx_scraper.events import (
+        CrawlCompletedEvent,
+        CrawlEvent,
+        CrawlEventType,
+        CrawlIssueDetectedEvent,
+        CrawlPageDiscoveredEvent,
+        CrawlPageFailedEvent,
+        CrawlPageFetchedEvent,
+        CrawlPageParsedEvent,
+        CrawlProgressEvent,
+        CrawlStartedEvent,
+        CrawlWarningEvent,
+        PageSummary,
+    )
+    from matrx_scraper.orchestrator import (
+        ScrapeResult,
+        scrape,
+        scrape_many,
+        scrape_many_stream,
+    )
+    from matrx_scraper.pagerank import Edge as PageRankEdge, compute_link_scores
+    from matrx_scraper.parser import ParserOrchestrator, parse_html
+    from matrx_scraper.parser.hashing import (
+        compute_hashes,
+        compute_minhash_from_text,
+        compute_simhash,
+    )
+    from matrx_scraper.parser.link_extractor import LinkExtractor
+    from matrx_scraper.parser.main_content import MainContentFinder
+    from matrx_scraper.parser.noise_config import NoiseRemoverConfig
+    from matrx_scraper.parser.noise_remover import NoiseRemover
+    from matrx_scraper.performance import (
+        GscClient,
+        GscPageSnapshot,
+        GscQueryRow,
+        PsiClient,
+        PsiSnapshot,
+    )
+    from matrx_scraper.preview import quick_preview
+    from matrx_scraper.queue_backend import (
+        InMemoryQueueBackend,
+        QueueBackend,
+        QueueItem,
+    )
+    from matrx_scraper.rate_limiter import HostRateLimiter
+    from matrx_scraper.recipe_runtime import (
+        CapturedScreenshot,
+        capture_screenshots,
+        execute_actions,
+    )
+    from matrx_scraper.recipes import (
+        DEFAULT_RECIPES,
+        CrawlRecipe,
+        RecipeAction,
+        RecipeBackend,
+        StaticRecipeBackend,
+    )
+    from matrx_scraper.search import BraveSearchClient, async_brave_search
+    from matrx_scraper.seo_audit import SeoAuditResult, audit_html
+    from matrx_scraper.service import ScrapeOptions, ScrapeService
+    from matrx_scraper.url_utils import normalize_url
+    from matrx_scraper.utils import URLInfo, get_url_info
+__all__ = sorted(_LAZY_IMPORTS)

matrx_scraper/_ext.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""
+External Dependency Registry for matrx-scraper package.
+Provides a configuration-based approach for injecting external dependencies
+that come from the host application (e.g., search functions).
+Usage (host application startup):
+    from matrx_scraper._ext import configure_ext
+    configure_ext(wrapped_brave_search=wrapped_brave_search)
+Usage (within matrx-scraper package):
+    from matrx_scraper._ext import get_ext
+    wrapped_brave_search = get_ext("wrapped_brave_search")
+"""
+from __future__ import annotations
+from typing import Any
+# Module-level store of injected dependencies, keyed by the keyword name
+# passed to configure_ext(); read by get_ext() and has_ext().
+_registry: dict[str, Any] = {}
+class ExtNotConfiguredError(RuntimeError):
+    """Raised by get_ext() when the requested dependency was never registered."""
+    pass
+def configure_ext(**kwargs: Any) -> None:
+    _registry.update(kwargs)
+def get_ext(name: str) -> Any:
+    if name not in _registry:
+        raise ExtNotConfiguredError(
+            f"matrx-scraper external dependency '{name}' not registered. "
+            f"Call matrx_scraper.configure() before using this functionality."
+        )
+    return _registry[name]
+def has_ext(name: str) -> bool:
+    return name in _registry

matrx_scraper/ai_browser/__init__.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""AI-callable browser automation surface.
+This module provides the **pure** browser-control primitives that any AI tool
+runtime (matrx-ai, MCP server, plain async functions) can build on top of.
+Everything here:
+    * Owns its own Playwright lifecycle (no host dependency).
+    * Returns plain dicts and dataclasses (no matrx-ai or matrx-connect imports).
+    * Is JSON-serialisable — every value can flow through an MCP transport.
+    * Is async-first.
+The shape mirrors the 9 server-side browser tools that matrx-ai already exposes
+(`browser_navigate` / `_click` / `_type_text` / `_select_option` / `_screenshot`
+/ `_wait_for` / `_get_element` / `_scroll` / `_close`) plus a handful of
+extractors that the AI loop tends to need when scraping fails:
+    * navigate → open or reuse a session, return final URL + title + (optional) text
+    * click   → CSS click; optional wait_after_ms
+    * fill    → set value of an <input>/<textarea>
+    * type    → type into a focused field, optional press-enter-and-wait
+    * select_option → set a <select> by value or label
+    * screenshot → PNG bytes (full page, viewport, or element)
+    * wait_for → wait for selector or visible text
+    * get_element → query an element's attrs, text, html
+    * get_html → page.content()  (post-recipe, post-JS)
+    * get_text → body innerText, capped
+    * query_selectors → bulk pull text/attrs across many selectors at once
+    * eval_js → page.evaluate() — locked-down by default; enable with allow_eval_js
+    * scroll → page or element scroll
+    * close → release a session
+Everything else (scraping, crawling, recipes) is composed on top of this.
+"""
+from __future__ import annotations
+from matrx_scraper.ai_browser.session import (
+    BrowserSession,
+    BrowserSessionManager,
+    get_browser_session_manager,
+)
+from matrx_scraper.ai_browser.client import (
+    RemoteBrowserClient,
+    BrowserClientError,
+)
+from matrx_scraper.ai_browser.actions import (
+    NavigateResult,
+    ClickResult,
+    FillResult,
+    TypeResult,
+    SelectOptionResult,
+    ScreenshotResult,
+    WaitForResult,
+    GetElementResult,
+    QuerySelectorsResult,
+    EvalJsResult,
+    ScrollResult,
+    GetHtmlResult,
+    GetTextResult,
+    navigate,
+    click,
+    fill,
+    type_text,
+    select_option,
+    screenshot,
+    wait_for,
+    get_element,
+    query_selectors,
+    eval_js,
+    scroll,
+    get_html,
+    get_text,
+    close as close_session,
+)
+# Names re-exported by this package, grouped in the same order as the imports
+# above: session management, remote HTTP client, action result types, actions.
+# `close_session` is the local alias for the `close` action.
+__all__ = [
+    # session
+    "BrowserSession",
+    "BrowserSessionManager",
+    "get_browser_session_manager",
+    # remote client (HTTP) — hosts that don't run Playwright themselves
+    "RemoteBrowserClient",
+    "BrowserClientError",
+    # action results
+    "NavigateResult",
+    "ClickResult",
+    "FillResult",
+    "TypeResult",
+    "SelectOptionResult",
+    "ScreenshotResult",
+    "WaitForResult",
+    "GetElementResult",
+    "QuerySelectorsResult",
+    "EvalJsResult",
+    "ScrollResult",
+    "GetHtmlResult",
+    "GetTextResult",
+    # actions
+    "navigate",
+    "click",
+    "fill",
+    "type_text",
+    "select_option",
+    "screenshot",
+    "wait_for",
+    "get_element",
+    "query_selectors",
+    "eval_js",
+    "scroll",
+    "get_html",
+    "get_text",
+    "close_session",
+]