matrx-scraper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. matrx_scraper/__init__.py +232 -0
  2. matrx_scraper/_ext.py +43 -0
  3. matrx_scraper/ai_browser/__init__.py +112 -0
  4. matrx_scraper/ai_browser/actions.py +573 -0
  5. matrx_scraper/ai_browser/client.py +438 -0
  6. matrx_scraper/ai_browser/session.py +193 -0
  7. matrx_scraper/ai_tools/__init__.py +44 -0
  8. matrx_scraper/ai_tools/specs.py +575 -0
  9. matrx_scraper/api/__init__.py +26 -0
  10. matrx_scraper/api/browser_router.py +326 -0
  11. matrx_scraper/api/ext_router.py +322 -0
  12. matrx_scraper/api/preview_router.py +29 -0
  13. matrx_scraper/api/scrape_router.py +224 -0
  14. matrx_scraper/browser_pool.py +218 -0
  15. matrx_scraper/cache.py +164 -0
  16. matrx_scraper/crawler.py +1051 -0
  17. matrx_scraper/custom_extractors.py +196 -0
  18. matrx_scraper/domain_config.py +315 -0
  19. matrx_scraper/events.py +234 -0
  20. matrx_scraper/extractors.py +73 -0
  21. matrx_scraper/features/__init__.py +15 -0
  22. matrx_scraper/features/extensions.py +61 -0
  23. matrx_scraper/features/mcp_tool_helpers.py +175 -0
  24. matrx_scraper/features/quick_search.py +48 -0
  25. matrx_scraper/features/read_page.py +189 -0
  26. matrx_scraper/features/utils.py +36 -0
  27. matrx_scraper/graph_nodes/__init__.py +44 -0
  28. matrx_scraper/graph_nodes/scrape_actions.py +252 -0
  29. matrx_scraper/graph_nodes/stock_image_actions.py +218 -0
  30. matrx_scraper/gsc_bootstrap.py +262 -0
  31. matrx_scraper/mcp/__init__.py +35 -0
  32. matrx_scraper/mcp/__main__.py +6 -0
  33. matrx_scraper/mcp/server.py +257 -0
  34. matrx_scraper/orchestrator.py +383 -0
  35. matrx_scraper/pagerank.py +104 -0
  36. matrx_scraper/parser/__init__.py +22 -0
  37. matrx_scraper/parser/core.py +386 -0
  38. matrx_scraper/parser/data_types.py +631 -0
  39. matrx_scraper/parser/element_extractor.py +845 -0
  40. matrx_scraper/parser/extraction_rules.py +67 -0
  41. matrx_scraper/parser/flattener.py +441 -0
  42. matrx_scraper/parser/hashing.py +89 -0
  43. matrx_scraper/parser/link_extractor.py +165 -0
  44. matrx_scraper/parser/main_content.py +76 -0
  45. matrx_scraper/parser/noise_config.py +133 -0
  46. matrx_scraper/parser/noise_remover.py +163 -0
  47. matrx_scraper/parser/overrides.py +188 -0
  48. matrx_scraper/parser/scrape_filter.py +229 -0
  49. matrx_scraper/parser/scrape_json_to_text.py +165 -0
  50. matrx_scraper/parser/transform.py +297 -0
  51. matrx_scraper/parser/utils.py +409 -0
  52. matrx_scraper/performance.py +643 -0
  53. matrx_scraper/preview.py +195 -0
  54. matrx_scraper/queue_backend.py +114 -0
  55. matrx_scraper/rate_limiter.py +114 -0
  56. matrx_scraper/recipe_runtime.py +140 -0
  57. matrx_scraper/recipes.py +141 -0
  58. matrx_scraper/scraper.py +788 -0
  59. matrx_scraper/search/__init__.py +18 -0
  60. matrx_scraper/search/brave_client.py +114 -0
  61. matrx_scraper/search/rate_limiter.py +27 -0
  62. matrx_scraper/search/search.py +188 -0
  63. matrx_scraper/seo_audit.py +413 -0
  64. matrx_scraper/server/__init__.py +19 -0
  65. matrx_scraper/server/__main__.py +57 -0
  66. matrx_scraper/server/app.py +181 -0
  67. matrx_scraper/server/config.py +64 -0
  68. matrx_scraper/service.py +424 -0
  69. matrx_scraper/url_utils.py +30 -0
  70. matrx_scraper/utils/__init__.py +8 -0
  71. matrx_scraper/utils/url.py +239 -0
  72. matrx_scraper-0.1.0.dist-info/METADATA +179 -0
  73. matrx_scraper-0.1.0.dist-info/RECORD +75 -0
  74. matrx_scraper-0.1.0.dist-info/WHEEL +4 -0
  75. matrx_scraper-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,232 @@
1
+ """matrx-scraper — Web scraping engine, HTML parsing, and search integration.
2
+
3
+ The top-level `matrx_scraper` namespace exposes a wide surface (orchestrator,
4
+ crawler, search, parsers, AI browser primitives, etc.). To avoid forcing
5
+ heavy optional dependencies (Playwright, Selenium fallbacks, etc.) on
6
+ consumers that only use a slice of the API (e.g. `matrx_scraper.search` or
7
+ `matrx_scraper.queue_backend` from aidream), we resolve top-level names
8
+ lazily via PEP 562 `__getattr__`.
9
+
10
+ Submodule imports (`from matrx_scraper.search import ...`,
11
+ `from matrx_scraper.ai_browser import ...`, etc.) bypass this lazy layer
12
+ and load only what the caller actually needs.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from typing import TYPE_CHECKING
17
+
18
+ _LAZY_IMPORTS: dict[str, str] = {
19
+ "scrape": "matrx_scraper.orchestrator",
20
+ "scrape_many": "matrx_scraper.orchestrator",
21
+ "scrape_many_stream": "matrx_scraper.orchestrator",
22
+ "ScrapeResult": "matrx_scraper.orchestrator",
23
+ "ScrapeService": "matrx_scraper.service",
24
+ "ScrapeOptions": "matrx_scraper.service",
25
+ "crawl_site": "matrx_scraper.crawler",
26
+ "SiteCrawler": "matrx_scraper.crawler",
27
+ "SiteCrawlerConfig": "matrx_scraper.crawler",
28
+ "CrawlEventSink": "matrx_scraper.crawler",
29
+ "PersistRequest": "matrx_scraper.crawler",
30
+ "PersistResult": "matrx_scraper.crawler",
31
+ "BodyPersister": "matrx_scraper.crawler",
32
+ "RENDER_HTTP_ONLY": "matrx_scraper.crawler",
33
+ "RENDER_HTTP_FIRST": "matrx_scraper.crawler",
34
+ "RENDER_BROWSER_ALWAYS": "matrx_scraper.crawler",
35
+ "RENDER_BROWSER_WITH_SCREENSHOT": "matrx_scraper.crawler",
36
+ "VALID_RENDER_MODES": "matrx_scraper.crawler",
37
+ "QueueBackend": "matrx_scraper.queue_backend",
38
+ "QueueItem": "matrx_scraper.queue_backend",
39
+ "InMemoryQueueBackend": "matrx_scraper.queue_backend",
40
+ "HostRateLimiter": "matrx_scraper.rate_limiter",
41
+ "audit_html": "matrx_scraper.seo_audit",
42
+ "SeoAuditResult": "matrx_scraper.seo_audit",
43
+ "CrawlEvent": "matrx_scraper.events",
44
+ "CrawlEventType": "matrx_scraper.events",
45
+ "CrawlStartedEvent": "matrx_scraper.events",
46
+ "CrawlPageDiscoveredEvent": "matrx_scraper.events",
47
+ "CrawlPageFetchedEvent": "matrx_scraper.events",
48
+ "CrawlPageParsedEvent": "matrx_scraper.events",
49
+ "CrawlPageFailedEvent": "matrx_scraper.events",
50
+ "CrawlProgressEvent": "matrx_scraper.events",
51
+ "CrawlIssueDetectedEvent": "matrx_scraper.events",
52
+ "CrawlWarningEvent": "matrx_scraper.events",
53
+ "CrawlCompletedEvent": "matrx_scraper.events",
54
+ "PageSummary": "matrx_scraper.events",
55
+ "parse_html": "matrx_scraper.parser",
56
+ "ParserOrchestrator": "matrx_scraper.parser",
57
+ "LinkExtractor": "matrx_scraper.parser.link_extractor",
58
+ "NoiseRemover": "matrx_scraper.parser.noise_remover",
59
+ "NoiseRemoverConfig": "matrx_scraper.parser.noise_config",
60
+ "MainContentFinder": "matrx_scraper.parser.main_content",
61
+ "compute_hashes": "matrx_scraper.parser.hashing",
62
+ "compute_minhash_from_text": "matrx_scraper.parser.hashing",
63
+ "compute_simhash": "matrx_scraper.parser.hashing",
64
+ "BraveSearchClient": "matrx_scraper.search",
65
+ "async_brave_search": "matrx_scraper.search",
66
+ "CacheBackend": "matrx_scraper.cache",
67
+ "MemoryCache": "matrx_scraper.cache",
68
+ "TwoTierCache": "matrx_scraper.cache",
69
+ "DomainConfigBackend": "matrx_scraper.domain_config",
70
+ "PostgresDomainConfigStore": "matrx_scraper.domain_config",
71
+ "StaticDomainConfigStore": "matrx_scraper.domain_config",
72
+ "PlaywrightBrowserPool": "matrx_scraper.browser_pool",
73
+ "URLInfo": "matrx_scraper.utils",
74
+ "get_url_info": "matrx_scraper.utils",
75
+ "normalize_url": "matrx_scraper.url_utils",
76
+ "compute_link_scores": "matrx_scraper.pagerank",
77
+ "PageRankEdge": "matrx_scraper.pagerank",
78
+ "CustomExtractor": "matrx_scraper.custom_extractors",
79
+ "find_extractors_for_url": "matrx_scraper.custom_extractors",
80
+ "run_custom_extractors": "matrx_scraper.custom_extractors",
81
+ "run_custom_extractor": "matrx_scraper.custom_extractors",
82
+ "CrawlRecipe": "matrx_scraper.recipes",
83
+ "RecipeAction": "matrx_scraper.recipes",
84
+ "RecipeBackend": "matrx_scraper.recipes",
85
+ "StaticRecipeBackend": "matrx_scraper.recipes",
86
+ "DEFAULT_RECIPES": "matrx_scraper.recipes",
87
+ "CapturedScreenshot": "matrx_scraper.recipe_runtime",
88
+ "capture_screenshots": "matrx_scraper.recipe_runtime",
89
+ "execute_actions": "matrx_scraper.recipe_runtime",
90
+ "PsiClient": "matrx_scraper.performance",
91
+ "PsiSnapshot": "matrx_scraper.performance",
92
+ "GscClient": "matrx_scraper.performance",
93
+ "GscPageSnapshot": "matrx_scraper.performance",
94
+ "GscQueryRow": "matrx_scraper.performance",
95
+ "quick_preview": "matrx_scraper.preview",
96
+ "BrowserSession": "matrx_scraper.ai_browser",
97
+ "BrowserSessionManager": "matrx_scraper.ai_browser",
98
+ "get_browser_session_manager": "matrx_scraper.ai_browser",
99
+ "RemoteBrowserClient": "matrx_scraper.ai_browser",
100
+ "BrowserClientError": "matrx_scraper.ai_browser",
101
+ "ToolSpec": "matrx_scraper.ai_tools",
102
+ "BROWSER_TOOLS": "matrx_scraper.ai_tools",
103
+ "SCRAPE_TOOLS": "matrx_scraper.ai_tools",
104
+ "CRAWL_TOOLS": "matrx_scraper.ai_tools",
105
+ "ALL_TOOLS": "matrx_scraper.ai_tools",
106
+ }
107
+
108
+
109
# Public export names whose defining module exposes the object under a
# different attribute name.  This mirrors the `X as Y` aliases used by this
# module's TYPE_CHECKING imports, which show the real names.
_LAZY_ALIASES: dict[str, str] = {
    "CustomExtractor": "Extractor",
    "find_extractors_for_url": "find_for_url",
    "run_custom_extractors": "run_all",
    "run_custom_extractor": "run_extractor",
    "PageRankEdge": "Edge",
}


def __getattr__(name: str):
    """Resolve a lazily exported top-level attribute (PEP 562).

    Looks ``name`` up in ``_LAZY_IMPORTS``, imports the module it maps to,
    fetches the attribute and caches it in this module's globals so later
    accesses no longer go through ``__getattr__``.

    Args:
        name: The attribute requested on the ``matrx_scraper`` package.

    Returns:
        The object exported under ``name``.

    Raises:
        AttributeError: If ``name`` is not a lazy export, or the target
            module provides neither ``name`` nor its aliased source name.
    """
    module_path = _LAZY_IMPORTS.get(name)
    if module_path is None:
        raise AttributeError(f"module 'matrx_scraper' has no attribute {name!r}")
    # Imported here so that merely importing the package stays cheap.
    import importlib

    module = importlib.import_module(module_path)
    try:
        value = getattr(module, name)
    except AttributeError:
        # The export may be an alias: retry under the name the defining
        # module actually uses.  Re-raise the original error otherwise.
        source_name = _LAZY_ALIASES.get(name)
        if source_name is None:
            raise
        value = getattr(module, source_name)
    globals()[name] = value
    return value
120
+
121
+
122
def __dir__() -> list[str]:
    """Return the sorted union of the module globals and the lazy export names."""
    names = set(globals())
    names.update(_LAZY_IMPORTS)
    return sorted(names)
124
+
125
+
126
+ if TYPE_CHECKING:
127
+ from matrx_scraper.ai_browser import (
128
+ BrowserClientError,
129
+ BrowserSession,
130
+ BrowserSessionManager,
131
+ RemoteBrowserClient,
132
+ get_browser_session_manager,
133
+ )
134
+ from matrx_scraper.ai_tools import (
135
+ ALL_TOOLS,
136
+ BROWSER_TOOLS,
137
+ CRAWL_TOOLS,
138
+ SCRAPE_TOOLS,
139
+ ToolSpec,
140
+ )
141
+ from matrx_scraper.browser_pool import PlaywrightBrowserPool
142
+ from matrx_scraper.cache import CacheBackend, MemoryCache, TwoTierCache
143
+ from matrx_scraper.crawler import (
144
+ RENDER_BROWSER_ALWAYS,
145
+ RENDER_BROWSER_WITH_SCREENSHOT,
146
+ RENDER_HTTP_FIRST,
147
+ RENDER_HTTP_ONLY,
148
+ VALID_RENDER_MODES,
149
+ BodyPersister,
150
+ CrawlEventSink,
151
+ PersistRequest,
152
+ PersistResult,
153
+ SiteCrawler,
154
+ SiteCrawlerConfig,
155
+ crawl_site,
156
+ )
157
+ from matrx_scraper.custom_extractors import (
158
+ Extractor as CustomExtractor,
159
+ find_for_url as find_extractors_for_url,
160
+ run_all as run_custom_extractors,
161
+ run_extractor as run_custom_extractor,
162
+ )
163
+ from matrx_scraper.domain_config import (
164
+ DomainConfigBackend,
165
+ PostgresDomainConfigStore,
166
+ StaticDomainConfigStore,
167
+ )
168
+ from matrx_scraper.events import (
169
+ CrawlCompletedEvent,
170
+ CrawlEvent,
171
+ CrawlEventType,
172
+ CrawlIssueDetectedEvent,
173
+ CrawlPageDiscoveredEvent,
174
+ CrawlPageFailedEvent,
175
+ CrawlPageFetchedEvent,
176
+ CrawlPageParsedEvent,
177
+ CrawlProgressEvent,
178
+ CrawlStartedEvent,
179
+ CrawlWarningEvent,
180
+ PageSummary,
181
+ )
182
+ from matrx_scraper.orchestrator import (
183
+ ScrapeResult,
184
+ scrape,
185
+ scrape_many,
186
+ scrape_many_stream,
187
+ )
188
+ from matrx_scraper.pagerank import Edge as PageRankEdge, compute_link_scores
189
+ from matrx_scraper.parser import ParserOrchestrator, parse_html
190
+ from matrx_scraper.parser.hashing import (
191
+ compute_hashes,
192
+ compute_minhash_from_text,
193
+ compute_simhash,
194
+ )
195
+ from matrx_scraper.parser.link_extractor import LinkExtractor
196
+ from matrx_scraper.parser.main_content import MainContentFinder
197
+ from matrx_scraper.parser.noise_config import NoiseRemoverConfig
198
+ from matrx_scraper.parser.noise_remover import NoiseRemover
199
+ from matrx_scraper.performance import (
200
+ GscClient,
201
+ GscPageSnapshot,
202
+ GscQueryRow,
203
+ PsiClient,
204
+ PsiSnapshot,
205
+ )
206
+ from matrx_scraper.preview import quick_preview
207
+ from matrx_scraper.queue_backend import (
208
+ InMemoryQueueBackend,
209
+ QueueBackend,
210
+ QueueItem,
211
+ )
212
+ from matrx_scraper.rate_limiter import HostRateLimiter
213
+ from matrx_scraper.recipe_runtime import (
214
+ CapturedScreenshot,
215
+ capture_screenshots,
216
+ execute_actions,
217
+ )
218
+ from matrx_scraper.recipes import (
219
+ DEFAULT_RECIPES,
220
+ CrawlRecipe,
221
+ RecipeAction,
222
+ RecipeBackend,
223
+ StaticRecipeBackend,
224
+ )
225
+ from matrx_scraper.search import BraveSearchClient, async_brave_search
226
+ from matrx_scraper.seo_audit import SeoAuditResult, audit_html
227
+ from matrx_scraper.service import ScrapeOptions, ScrapeService
228
+ from matrx_scraper.url_utils import normalize_url
229
+ from matrx_scraper.utils import URLInfo, get_url_info
230
+
231
+
232
+ __all__ = sorted(_LAZY_IMPORTS)
matrx_scraper/_ext.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ External Dependency Registry for matrx-scraper package.
3
+
4
+ Provides a configuration-based approach for injecting external dependencies
5
+ that come from the host application (e.g., search functions).
6
+
7
+ Usage (host application startup):
8
+
9
+ from matrx_scraper._ext import configure_ext
10
+ configure_ext(wrapped_brave_search=wrapped_brave_search)
11
+
12
+ Usage (within matrx-scraper package):
13
+
14
+ from matrx_scraper._ext import get_ext
15
+ wrapped_brave_search = get_ext("wrapped_brave_search")
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from typing import Any
21
+
22
+ _registry: dict[str, Any] = {}
23
+
24
+
25
class ExtNotConfiguredError(RuntimeError):
    """Raised by ``get_ext`` when the requested dependency is not registered."""
27
+
28
+
29
def configure_ext(**kwargs: Any) -> None:
    """Register external dependencies under their keyword names.

    Each keyword argument is stored in the module-level registry; a name
    that is already registered is overwritten with the new value.
    """
    for ext_name, ext_value in kwargs.items():
        _registry[ext_name] = ext_value
31
+
32
+
33
def get_ext(name: str) -> Any:
    """Return the external dependency registered under ``name``.

    Args:
        name: The keyword that was passed to ``configure_ext``.

    Returns:
        The object registered under ``name``.

    Raises:
        ExtNotConfiguredError: If nothing has been registered under ``name``.
    """
    try:
        return _registry[name]
    except KeyError:
        # Point at the real registration entry point; the bare KeyError adds
        # nothing for the caller, so it is suppressed.
        raise ExtNotConfiguredError(
            f"matrx-scraper external dependency '{name}' not registered. "
            f"Call matrx_scraper._ext.configure_ext({name}=...) before using "
            f"this functionality."
        ) from None
40
+
41
+
42
def has_ext(name: str) -> bool:
    """Report whether an external dependency is registered under ``name``."""
    is_registered = name in _registry
    return is_registered
@@ -0,0 +1,112 @@
1
+ """AI-callable browser automation surface.
2
+
3
+ This module provides the **pure** browser-control primitives that any AI tool
4
+ runtime (matrx-ai, MCP server, plain async functions) can build on top of.
5
+ Everything here:
6
+
7
+ * Owns its own Playwright lifecycle (no host dependency).
8
+ * Returns plain dicts and dataclasses (no matrx-ai or matrx-connect imports).
9
+ * Is JSON-serialisable — every value can flow through an MCP transport.
10
+ * Is async-first.
11
+
12
+ The shape mirrors the 9 server-side browser tools that matrx-ai already exposes
13
+ (`browser_navigate` / `_click` / `_type_text` / `_select_option` / `_screenshot`
14
+ / `_wait_for` / `_get_element` / `_scroll` / `_close`) plus a handful of
15
+ extractors that the AI loop tends to need when scraping fails:
16
+
17
+ * navigate → open or reuse a session, return final URL + title + (optional) text
18
+ * click → CSS click; optional wait_after_ms
19
+ * fill → set value of an <input>/<textarea>
20
+ * type → type into a focused field, optional press-enter-and-wait
21
+ * select_option → set a <select> by value or label
22
+ * screenshot → PNG bytes (full page, viewport, or element)
23
+ * wait_for → wait for selector or visible text
24
+ * get_element → query an element's attrs, text, html
25
+ * get_html → page.content() (post-recipe, post-JS)
26
+ * get_text → body innerText, capped
27
+ * query_selectors → bulk pull text/attrs across many selectors at once
28
+ * eval_js → page.evaluate() — locked-down by default; enable with allow_eval_js
29
+ * scroll → page or element scroll
30
+ * close → release a session
31
+
32
+ Everything else (scraping, crawling, recipes) is composed on top of this.
33
+ """
34
+ from __future__ import annotations
35
+
36
+ from matrx_scraper.ai_browser.session import (
37
+ BrowserSession,
38
+ BrowserSessionManager,
39
+ get_browser_session_manager,
40
+ )
41
+ from matrx_scraper.ai_browser.client import (
42
+ RemoteBrowserClient,
43
+ BrowserClientError,
44
+ )
45
+ from matrx_scraper.ai_browser.actions import (
46
+ NavigateResult,
47
+ ClickResult,
48
+ FillResult,
49
+ TypeResult,
50
+ SelectOptionResult,
51
+ ScreenshotResult,
52
+ WaitForResult,
53
+ GetElementResult,
54
+ QuerySelectorsResult,
55
+ EvalJsResult,
56
+ ScrollResult,
57
+ GetHtmlResult,
58
+ GetTextResult,
59
+ navigate,
60
+ click,
61
+ fill,
62
+ type_text,
63
+ select_option,
64
+ screenshot,
65
+ wait_for,
66
+ get_element,
67
+ query_selectors,
68
+ eval_js,
69
+ scroll,
70
+ get_html,
71
+ get_text,
72
+ close as close_session,
73
+ )
74
+
75
+ __all__ = [
76
+ # session
77
+ "BrowserSession",
78
+ "BrowserSessionManager",
79
+ "get_browser_session_manager",
80
+ # remote client (HTTP) — hosts that don't run Playwright themselves
81
+ "RemoteBrowserClient",
82
+ "BrowserClientError",
83
+ # action results
84
+ "NavigateResult",
85
+ "ClickResult",
86
+ "FillResult",
87
+ "TypeResult",
88
+ "SelectOptionResult",
89
+ "ScreenshotResult",
90
+ "WaitForResult",
91
+ "GetElementResult",
92
+ "QuerySelectorsResult",
93
+ "EvalJsResult",
94
+ "ScrollResult",
95
+ "GetHtmlResult",
96
+ "GetTextResult",
97
+ # actions
98
+ "navigate",
99
+ "click",
100
+ "fill",
101
+ "type_text",
102
+ "select_option",
103
+ "screenshot",
104
+ "wait_for",
105
+ "get_element",
106
+ "query_selectors",
107
+ "eval_js",
108
+ "scroll",
109
+ "get_html",
110
+ "get_text",
111
+ "close_session",
112
+ ]