matrx-scraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. matrx_scraper-0.1.0/.env.example +34 -0
  2. matrx_scraper-0.1.0/.gitignore +257 -0
  3. matrx_scraper-0.1.0/CLAUDE.md +104 -0
  4. matrx_scraper-0.1.0/Dockerfile +50 -0
  5. matrx_scraper-0.1.0/GAPS_TO_FIX.md +220 -0
  6. matrx_scraper-0.1.0/LEGACY_AUDIT.md +115 -0
  7. matrx_scraper-0.1.0/MIGRATION_GUIDE.md +366 -0
  8. matrx_scraper-0.1.0/MIGRATION_STATUS.md +159 -0
  9. matrx_scraper-0.1.0/PKG-INFO +179 -0
  10. matrx_scraper-0.1.0/README.md +104 -0
  11. matrx_scraper-0.1.0/SCHEMA.md +616 -0
  12. matrx_scraper-0.1.0/SCRAPER_SERVICE_GAPS.md +208 -0
  13. matrx_scraper-0.1.0/STANDALONE_USAGE.md +626 -0
  14. matrx_scraper-0.1.0/docker-compose.yml +40 -0
  15. matrx_scraper-0.1.0/matrx_scraper/__init__.py +232 -0
  16. matrx_scraper-0.1.0/matrx_scraper/_ext.py +43 -0
  17. matrx_scraper-0.1.0/matrx_scraper/ai_browser/__init__.py +112 -0
  18. matrx_scraper-0.1.0/matrx_scraper/ai_browser/actions.py +573 -0
  19. matrx_scraper-0.1.0/matrx_scraper/ai_browser/client.py +438 -0
  20. matrx_scraper-0.1.0/matrx_scraper/ai_browser/session.py +193 -0
  21. matrx_scraper-0.1.0/matrx_scraper/ai_tools/__init__.py +44 -0
  22. matrx_scraper-0.1.0/matrx_scraper/ai_tools/specs.py +575 -0
  23. matrx_scraper-0.1.0/matrx_scraper/api/__init__.py +26 -0
  24. matrx_scraper-0.1.0/matrx_scraper/api/browser_router.py +326 -0
  25. matrx_scraper-0.1.0/matrx_scraper/api/ext_router.py +322 -0
  26. matrx_scraper-0.1.0/matrx_scraper/api/preview_router.py +29 -0
  27. matrx_scraper-0.1.0/matrx_scraper/api/scrape_router.py +224 -0
  28. matrx_scraper-0.1.0/matrx_scraper/browser_pool.py +218 -0
  29. matrx_scraper-0.1.0/matrx_scraper/cache.py +164 -0
  30. matrx_scraper-0.1.0/matrx_scraper/crawler.py +1051 -0
  31. matrx_scraper-0.1.0/matrx_scraper/custom_extractors.py +196 -0
  32. matrx_scraper-0.1.0/matrx_scraper/domain_config.py +315 -0
  33. matrx_scraper-0.1.0/matrx_scraper/events.py +234 -0
  34. matrx_scraper-0.1.0/matrx_scraper/extractors.py +73 -0
  35. matrx_scraper-0.1.0/matrx_scraper/features/__init__.py +15 -0
  36. matrx_scraper-0.1.0/matrx_scraper/features/extensions.py +61 -0
  37. matrx_scraper-0.1.0/matrx_scraper/features/mcp_tool_helpers.py +175 -0
  38. matrx_scraper-0.1.0/matrx_scraper/features/quick_search.py +48 -0
  39. matrx_scraper-0.1.0/matrx_scraper/features/read_page.py +189 -0
  40. matrx_scraper-0.1.0/matrx_scraper/features/utils.py +36 -0
  41. matrx_scraper-0.1.0/matrx_scraper/graph_nodes/__init__.py +44 -0
  42. matrx_scraper-0.1.0/matrx_scraper/graph_nodes/scrape_actions.py +252 -0
  43. matrx_scraper-0.1.0/matrx_scraper/graph_nodes/stock_image_actions.py +218 -0
  44. matrx_scraper-0.1.0/matrx_scraper/gsc_bootstrap.py +262 -0
  45. matrx_scraper-0.1.0/matrx_scraper/mcp/__init__.py +35 -0
  46. matrx_scraper-0.1.0/matrx_scraper/mcp/__main__.py +6 -0
  47. matrx_scraper-0.1.0/matrx_scraper/mcp/server.py +257 -0
  48. matrx_scraper-0.1.0/matrx_scraper/orchestrator.py +383 -0
  49. matrx_scraper-0.1.0/matrx_scraper/pagerank.py +104 -0
  50. matrx_scraper-0.1.0/matrx_scraper/parser/__init__.py +22 -0
  51. matrx_scraper-0.1.0/matrx_scraper/parser/core.py +386 -0
  52. matrx_scraper-0.1.0/matrx_scraper/parser/data_types.py +631 -0
  53. matrx_scraper-0.1.0/matrx_scraper/parser/element_extractor.py +845 -0
  54. matrx_scraper-0.1.0/matrx_scraper/parser/extraction_rules.py +67 -0
  55. matrx_scraper-0.1.0/matrx_scraper/parser/flattener.py +441 -0
  56. matrx_scraper-0.1.0/matrx_scraper/parser/hashing.py +89 -0
  57. matrx_scraper-0.1.0/matrx_scraper/parser/link_extractor.py +165 -0
  58. matrx_scraper-0.1.0/matrx_scraper/parser/main_content.py +76 -0
  59. matrx_scraper-0.1.0/matrx_scraper/parser/noise_config.py +133 -0
  60. matrx_scraper-0.1.0/matrx_scraper/parser/noise_remover.py +163 -0
  61. matrx_scraper-0.1.0/matrx_scraper/parser/overrides.py +188 -0
  62. matrx_scraper-0.1.0/matrx_scraper/parser/scrape_filter.py +229 -0
  63. matrx_scraper-0.1.0/matrx_scraper/parser/scrape_json_to_text.py +165 -0
  64. matrx_scraper-0.1.0/matrx_scraper/parser/transform.py +297 -0
  65. matrx_scraper-0.1.0/matrx_scraper/parser/utils.py +409 -0
  66. matrx_scraper-0.1.0/matrx_scraper/performance.py +643 -0
  67. matrx_scraper-0.1.0/matrx_scraper/preview.py +195 -0
  68. matrx_scraper-0.1.0/matrx_scraper/queue_backend.py +114 -0
  69. matrx_scraper-0.1.0/matrx_scraper/rate_limiter.py +114 -0
  70. matrx_scraper-0.1.0/matrx_scraper/recipe_runtime.py +140 -0
  71. matrx_scraper-0.1.0/matrx_scraper/recipes.py +141 -0
  72. matrx_scraper-0.1.0/matrx_scraper/scraper.py +788 -0
  73. matrx_scraper-0.1.0/matrx_scraper/search/__init__.py +18 -0
  74. matrx_scraper-0.1.0/matrx_scraper/search/brave_client.py +114 -0
  75. matrx_scraper-0.1.0/matrx_scraper/search/rate_limiter.py +27 -0
  76. matrx_scraper-0.1.0/matrx_scraper/search/search.py +188 -0
  77. matrx_scraper-0.1.0/matrx_scraper/seo_audit.py +413 -0
  78. matrx_scraper-0.1.0/matrx_scraper/server/__init__.py +19 -0
  79. matrx_scraper-0.1.0/matrx_scraper/server/__main__.py +57 -0
  80. matrx_scraper-0.1.0/matrx_scraper/server/app.py +181 -0
  81. matrx_scraper-0.1.0/matrx_scraper/server/config.py +64 -0
  82. matrx_scraper-0.1.0/matrx_scraper/service.py +424 -0
  83. matrx_scraper-0.1.0/matrx_scraper/url_utils.py +30 -0
  84. matrx_scraper-0.1.0/matrx_scraper/utils/__init__.py +8 -0
  85. matrx_scraper-0.1.0/matrx_scraper/utils/url.py +239 -0
  86. matrx_scraper-0.1.0/pyproject.toml +95 -0
  87. matrx_scraper-0.1.0/scripts/release.sh +5 -0
  88. matrx_scraper-0.1.0/tests/test_crawler.py +176 -0
  89. matrx_scraper-0.1.0/tests/test_stock_image_actions.py +93 -0
@@ -0,0 +1,34 @@
1
+ # =============================================================================
2
+ # Matrx Scraper — Standalone Server Configuration
3
+ # =============================================================================
4
+ # Copy this file to .env and fill in the values.
5
+
6
+ # --- Required ---
7
+ DATABASE_URL=postgresql://user:password@host:5432/dbname
8
+ SUPABASE_JWT_SECRET=your-supabase-jwt-secret
9
+
10
+ # --- Optional auth ---
11
+ # ADMIN_API_TOKEN is accepted by AuthMiddleware but the static-token escape
12
+ # hatch was removed in 2026-05, so any value set here is ignored. Leave unset.
13
+ ADMIN_API_TOKEN=
14
+
15
+ # --- Search ---
16
+ BRAVE_API_KEY=
17
+
18
+ # --- Proxy ---
19
+ PROXY_DATACENTER_URL=
20
+ PROXY_RESIDENTIAL_URL=
21
+
22
+ # --- Browser pool ---
23
+ BROWSER_POOL_SIZE=3
24
+ ENABLE_BROWSER_POOL=true
25
+
26
+ # --- Feature toggles ---
27
+ ENABLE_CACHE=true
28
+ ENABLE_DOMAIN_CONFIG=true
29
+
30
+ # --- Server ---
31
+ HOST=0.0.0.0
32
+ PORT=8000
33
+ WORKERS=1
34
+ LOG_LEVEL=info
@@ -0,0 +1,257 @@
1
+ *.pyc
2
+ secrets/
3
+ ignore/
4
+ temp/
5
+ logs/
6
+ todo
7
+ text_notes/
8
+ aidream/secrets/2.env
9
+ automation_matrix/matrix_processing/temp/*
10
+ cd
11
+ # Byte-compiled / optimized / DLL files
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # C extensions
17
+ *.so
18
+ .venv/
19
+
20
+ # Distribution / packaging
21
+ .Python
22
+ build/
23
+ develop-eggs/
24
+ dist/
25
+ downloads/
26
+ eggs/
27
+ .eggs/
28
+ lib/
29
+ lib64/
30
+ # The blanket lib/ rule above is from the standard Python .gitignore template
31
+ # and was silently swallowing TS source under the SPA `src/lib/` folders.
32
+ # Re-allow them explicitly so frontend builds don't ship without their lib layer.
33
+ !dashboard/src/lib/
34
+ !dashboard/src/lib/**
35
+ !workflow-studio/src/lib/
36
+ !workflow-studio/src/lib/**
37
+ parts/
38
+ sdist/
39
+ var/
40
+ wheels/
41
+ share/python-wheels/
42
+ *.egg-info/
43
+ .installed.cfg
44
+ *.egg
45
+ MANIFEST
46
+
47
+ # PyInstaller
48
+ # Usually these files are written by a python script from a template
49
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
50
+ *.manifest
51
+ *.spec
52
+
53
+ # Installer logs
54
+ pip-log.txt
55
+ pip-delete-this-directory.txt
56
+
57
+ # Unit test / coverage reports
58
+ ai/tests/clean_response.json
59
+ ai/tests/cx_storage_response.json
60
+ ai/tests/execution_test.py
61
+ ai/tests/final_response.json
62
+ htmlcov/
63
+ .tox/
64
+ .nox/
65
+ .coverage
66
+ .coverage.*
67
+ .cache
68
+ nosetests.xml
69
+ coverage.xml
70
+ *.cover
71
+ *.py,cover
72
+ .hypothesis/
73
+ .pytest_cache/
74
+ cover/
75
+
76
+ # Translations
77
+ *.mo
78
+ *.pot
79
+
80
+ # Django stuff:
81
+ *.log
82
+ local_settings.py
83
+ db.sqlite3
84
+ db.sqlite3-journal
85
+
86
+ # Flask stuff:
87
+ instance/
88
+ .webassets-cache
89
+
90
+ # Scrapy stuff:
91
+ .scrapy
92
+
93
+ # Sphinx documentation
94
+ docs/_build/
95
+
96
+ # PyBuilder
97
+ .pybuilder/
98
+ target/
99
+
100
+ # Jupyter Notebook
101
+ .ipynb_checkpoints
102
+
103
+ # IPython
104
+ profile_default/
105
+ ipython_config.py
106
+
107
+ # pyenv
108
+ # For a library or package, you might want to ignore these files since the code is
109
+ # intended to run in multiple environments; otherwise, check them in:
110
+ # .python-version
111
+
112
+ # pipenv
113
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
114
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
115
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
116
+ # install all needed dependencies.
117
+ #Pipfile.lock
118
+
119
+ # poetry
120
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
121
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
122
+ # commonly ignored for libraries.
123
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
124
+
125
+ # pdm
126
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
127
+ #pdm.lock
128
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
129
+ # in version control.
130
+ # https://pdm.fming.dev/#use-with-ide
131
+ .pdm.toml
132
+
133
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134
+ __pypackages__/
135
+
136
+ # Celery stuff
137
+ celerybeat-schedule
138
+ celerybeat.pid
139
+
140
+ # SageMath parsed files
141
+ *.sage.py
142
+
143
+ # Environments
144
+ .env
145
+ .env_remote
146
+ .venv
147
+ env/
148
+ venv/
149
+ ENV/
150
+ env.bak/
151
+ venv.bak/
152
+ .env.armanonly
153
+
154
+ # Spyder project settings
155
+ .spyderproject
156
+ .spyproject
157
+
158
+ # Rope project settings
159
+ .ropeproject
160
+
161
+ # mkdocs documentation
162
+ /site
163
+
164
+ # mypy
165
+ .mypy_cache/
166
+ .dmypy.json
167
+ dmypy.json
168
+
169
+ # Pyre type checker
170
+ .pyre/
171
+
172
+ # random armani files
173
+ /armani_dev/secrets/
174
+ /armani/
175
+ /_armani/
176
+
177
+
178
+
179
+ # pytype static type analyzer
180
+ .pytype/
181
+
182
+ # Cython debug symbols
183
+ cython_debug/
184
+
185
+ .idea/
186
+ .vscode/
187
+ /node_modules/
188
+
189
+ dump.rdb
190
+
191
+ frontend/
192
+
193
+ # AME Temp Files and directory structure
194
+ # Ignore all files in the temp directory and its subdirectories
195
+ /temp/**/*
196
+ /tmp/**/*
197
+
198
+ # Allow .gitkeep files to retain directory structure
199
+ !/temp/**/.gitkeep
200
+ !/tmp/**/.gitkeep
201
+
202
+ # Armani
203
+ .history*
204
+ .history/
205
+ local_data/
206
+ local_reports_data/
207
+ webscraper/quick_scrapes/temp/
208
+ automation_matrix/ai_apis/fireworks/_dev/*
209
+ automation_matrix/ai_apis/fireworks/_dev/fireworks_sample.py
210
+ *.pdf
211
+ *.flac
212
+ *.mp3
213
+ *.wav
214
+ miniconda.sh
215
+ /database/python_sql/temp_data/
216
+ .history*
217
+ .history/
218
+ .history/
219
+
220
+ _dev/
221
+ /_dev/
222
+ requirements_filtered.txt
223
+
224
+ # matrx-dev-tools backups
225
+ .env-backups/
226
+ # Matrx Ship config (contains API key)
227
+ .matrx-ship.json
228
+
229
+ # Matrx config (contains API keys)
230
+ .matrx.json
231
+ .matrx-tools.conf
232
+
233
+ # Claude Code local worktrees and per-user settings
234
+ .claude/worktrees/
235
+ .claude/settings.local.json
236
+
237
+ # Append-only snapshots from matrx_utils.update_history (unbounded; do not commit)
238
+ common/utils/data_in_code/data_history.json
239
+ packages/matrx-utils/matrx_utils/data_in_code/data_history.json
240
+
241
+ # Tool-dispatch debug logs — one file per server start, never committed
242
+ .matrx-debug/
243
+
244
+ # macOS Finder metadata
245
+ .DS_Store
246
+ **/.DS_Store
247
+
248
+ # Environment files
249
+ .env
250
+ .env.*
251
+ *.env
252
+ *.env.*
253
+
254
+ # Keep safe templates trackable
255
+ !.env.example
256
+ !.env.sample
257
+ !.env.template
@@ -0,0 +1,104 @@
1
+ # CLAUDE.md — matrx-scraper
2
+
3
+ > **Operating Principle: Build the platform, not the artifact.** Every task is a probe that exposes a missing capability — build it, then consume it. Code that only serves one artifact is forbidden. Full doctrine: [/PRINCIPLES.md](../../PRINCIPLES.md).
4
+
5
+ **Package:** `matrx-scraper` (PyPI) — Python 3.12+ — currently v0.1.0 (alpha)
6
+ **Role in the graph:** Tier 2. Depends on `matrx-utils`. Optionally depends on `matrx-connect` (for streaming/events integration) and `matrx-orm` (for the Postgres-backed domain-config backend). Used by the aidream app and potentially by matrx-graph nodes.
7
+
8
+ ---
9
+
10
+ ## Read this first
11
+
12
+ matrx-scraper is the canonical web scraping + HTML parsing + site crawling + search + rendering pipeline for the Matrx family. It **replaces** the legacy root-level `scraper/` directory and parts of `research/`. Do not add new scraping logic to those folders.
13
+
14
+ This package is also a strong example of optional-sibling integration: it works standalone (just `pip install matrx-scraper`), degrades gracefully when matrx-connect isn't present, and uses an `_ext.py` registry for host-provided hooks.
15
+
16
+ ---
17
+
18
+ ## What this package provides
19
+
20
+ - **Scraping** (`matrx_scraper.scraper`, `matrx_scraper.orchestrator`): `scrape(url, **opts)`, `scrape_many(urls, ...)`, `scrape_many_stream(...)`, `ScrapeResult`, `ScrapeOptions`, `ScrapeService`.
21
+ - **Parsing pipeline** (`matrx_scraper.parser`): 8-stage HTML pipeline — normalize → `NoiseRemover` → `ScrapeFilter` → `ElementExtractor` → `LinkExtractor` → metadata (extruct) → hashing (MinHash/SimHash) → markdownify. Entry point: `parse_html(html, **opts)` and `ParserOrchestrator`.
22
+ - **Crawling** (`matrx_scraper.crawler`): `crawl_site(base_url)`, `SiteCrawler` — async BFS traversal, robots.txt-aware.
23
+ - **Search** (`matrx_scraper.search`): `BraveSearchClient`.
24
+ - **Caching** (`matrx_scraper.cache`): `CacheBackend` with `MemoryCache`, `TwoTierCache`.
25
+ - **Per-URL / per-domain config** (`matrx_scraper.domain_config`): `DomainConfigBackend`. Default is static; a Postgres-backed backend is available behind an optional `matrx-orm` extra.
26
+ - **Browser automation** (optional): `PlaywrightBrowserPool` behind the `[browser]` extra.
27
+ - **FastAPI server** (optional, `[server]` extra): `matrx-scraper` CLI / `server/__main__.py`, plus routers under `api/`.
28
+
29
+ Public API is the ~20 symbols in `matrx_scraper/__init__.py`. Keep stable.
30
+
31
+ ---
32
+
33
+ ## The `_ext` / `configure_ext` injection pattern
34
+
35
+ Because matrx-scraper wants to integrate with matrx-connect's event system when present, but also run standalone, it uses the `_ext` registry pattern:
36
+
37
+ ```python
38
+ # Host wires it at startup, passing matrx-connect types in:
39
+ import matrx_scraper
40
+ from matrx_connect.context.events import InfoPayload, WarningPayload
41
+ from matrx_connect.context.data_types import …
42
+
43
+ matrx_scraper.configure_ext(
44
+ info_payload_cls=InfoPayload,
45
+ warning_payload_cls=WarningPayload,
46
+ # …
47
+ )
48
+ ```
49
+
50
+ Inside this package:
51
+
52
+ - All matrx-connect imports are **conditional** — wrapped in `has_ext("connect")` checks or lazy imports behind functions.
53
+ - `ScrapeService` (the FastAPI adapter in `service.py`) accepts an injected `AppContext` shape; it does not import matrx-connect eagerly.
54
+ - Cache / domain-config / browser-pool backends are all injectable — pass them to the orchestrator or leave as defaults.
55
+
56
+ ---
57
+
58
+ ## Dependency rules specific to this package
59
+
60
+ - ✅ `from matrx_utils import …` — declared hard dep.
61
+ - ✅ `from matrx_connect import …` — **only** inside conditionally-imported code paths, and only when `has_ext("connect")` confirms it was configured. Never at module top level of a core scraping file.
62
+ - ✅ `from matrx_orm import …` — **only** inside the Postgres domain-config backend module, behind the `[postgres]` extra.
63
+ - ❌ No `from matrx_ai import …` (matrx-ai depends on graph, not scraper; keep it that way).
64
+ - ❌ No `from matrx_graph import …` at module top level. If a scraping step wants to run a graph, the host composes them; matrx-scraper doesn't orchestrate graphs internally.
65
+ - ❌ No `from aidream import …`. No imports from root `scraper/`, `research/`, `common/`, `config/`, `api_management/`.
66
+
67
+ ---
68
+
69
+ ## Python standards (same as root)
70
+
71
+ - Full type hints. Parser/scraper result types are public contracts — Pydantic.
72
+ - No docstrings except on the public API (`scrape`, `parse_html`, `ScrapeResult`, `ScrapeService`, `crawl_site`). One line.
73
+ - Hot paths: HTML parsing (selectolax over bs4 where both are available), noise removal, link classification. Avoid repeated DOM traversals; do one pass and reuse structures.
74
+ - Explicit exception handling — every network op has retry + timeout, and errors surface through the emitter when one is available.
75
+
76
+ ---
77
+
78
+ ## Testing this package in isolation
79
+
80
+ ```bash
81
+ uv run pytest packages/matrx-scraper/tests
82
+ ```
83
+
84
+ Tests must run with only matrx-utils installed. Browser + Postgres-backed tests live behind extras and skip when the extras aren't installed.
85
+
86
+ ---
87
+
88
+ ## Relationship to legacy `scraper/` and `research/`
89
+
90
+ Both are being **migrated out**. The internal docs in this package (`MIGRATION_STATUS.md`, `GAPS_TO_FIX.md`, `LEGACY_AUDIT.md`, `MIGRATION_GUIDE.md`) track specifics. High-level rules:
91
+
92
+ - Consumers in `research/` should import from `matrx_scraper`, not from the old `scraper/` directory.
93
+ - Features still missing from matrx-scraper (MCP tool adapters, auto-clicker, shadow-DOM walker, stateful browser sessions, disk cache, StackBlitz preprocessor, etc.) should be ported into this package, not extended inside `scraper/`.
94
+ - When you port a feature, delete the corresponding code in `scraper/` after confirming no remaining consumers.
95
+
96
+ See the package-internal migration docs **and** root `PACKAGES_MIGRATION_PLAN.md`.
97
+
98
+ ---
99
+
100
+ ## Known issues
101
+
102
+ - Some advanced features listed in `MIGRATION_STATUS.md` are not yet ported.
103
+ - `scrape_many_stream` was added late; consumers may still be using buffered `scrape_many`. Prefer the streaming path.
104
+ - This package's code itself has no known aidream-coupling violations.
@@ -0,0 +1,50 @@
1
+ # Standalone scraper microservice.
2
+ # Build context: monorepo root (so we can access pyproject.toml + uv.lock + sibling workspace packages).
3
+ # Coolify config: base_directory=/, dockerfile_location=/packages/matrx-scraper/Dockerfile.
4
+
5
+ FROM python:3.13-slim AS base
6
+
7
+ ENV PYTHONUNBUFFERED=1 \
8
+ PYTHONDONTWRITEBYTECODE=1 \
9
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
10
+ UV_LINK_MODE=copy \
11
+ UV_COMPILE_BYTECODE=1
12
+
13
+ # System deps: curl for healthcheck. Playwright browser deps are installed
14
+ # later by `playwright install --with-deps`.
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ curl \
17
+ ca-certificates \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ RUN pip install --no-cache-dir uv
21
+
22
+ WORKDIR /app
23
+
24
+ # uv sync needs the workspace root pyproject.toml + uv.lock and ALL workspace
25
+ # members listed in [tool.uv.workspace] members. We copy the whole packages/
26
+ # directory (~10 MB) — partial copies break workspace resolution.
27
+ COPY pyproject.toml uv.lock ./
28
+ COPY packages/ ./packages/
29
+
30
+ # Install matrx-scraper plus its [server] extras, using the frozen uv.lock
31
+ # from the workspace root. `--package matrx-scraper` focuses sync on this
32
+ # workspace member only — the heavy aidream-current root project is NOT
33
+ # installed. --no-dev skips dev-only tooling.
34
+ RUN uv sync --frozen --no-dev --package matrx-scraper --extra server
35
+
36
+ # Playwright Chromium + its OS deps. Largest layer; isolated for caching.
37
+ RUN uv run --no-sync playwright install --with-deps chromium
38
+
39
+ # Default to 8001 to match the existing Coolify Traefik labels and avoid
40
+ # proxy reconfiguration. ServerConfig.from_env() respects PORT.
41
+ ENV PORT=8001 \
42
+ HOST=0.0.0.0
43
+
44
+ EXPOSE 8001
45
+
46
+ # /health/ready confirms DB pool + cache are wired before traffic flows.
47
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
48
+ CMD curl -fsS http://localhost:8001/health/ready >/dev/null || exit 1
49
+
50
+ CMD ["uv", "run", "--no-sync", "python", "-m", "matrx_scraper.server"]
@@ -0,0 +1,220 @@
1
+ # Gaps to Fix Before Switching the Router to `matrx-scraper`
2
+
3
+ This document lists only the things we **lose** by switching the FastAPI router
4
+ (`aidream/api/routers/scraper.py`) from the old `scraper/services_v2/service.py`
5
+ to the new `matrx_scraper` package. It is a to-do list for the new package, not
6
+ a comparison of the two systems.
7
+
8
+ Improvements in the new system are not listed here — they are already done and we
9
+ keep them. Frontend breaking changes are flagged so the React team can be notified.
10
+
11
+ ---
12
+
13
+ ## 1. Streaming — `scrape_many()` Does Not Stream
14
+
15
+ **Severity: BLOCKER**
16
+
17
+ The old system yields each result to the frontend *as it finishes*, so the user
18
+ sees pages arrive one by one. The new `scrape_many()` uses `asyncio.gather` and
19
+ returns a batch only after every URL in the list completes.
20
+
21
+ **What needs to be added to `orchestrator.py`:**
22
+
23
+ An async-generator companion:
24
+
25
+ ```python
26
+ async def scrape_many_stream(
27
+ urls: List[str],
28
+ use_proxy: bool = True,
29
+ concurrency: int = 5,
30
+ ) -> AsyncGenerator[ScrapeResult, None]:
31
+ semaphore = asyncio.Semaphore(concurrency)
32
+
33
+ async def _bounded(url: str) -> ScrapeResult:
34
+ async with semaphore:
35
+ return await scrape(url, use_proxy=use_proxy)
36
+
37
+ for coro in asyncio.as_completed([_bounded(u) for u in urls]):
38
+ yield await coro
39
+ ```
40
+
41
+ The `ScrapeService` adapter (Priority 2 in `MIGRATION_STATUS.md`) must use this
42
+ generator and call `emitter.send_data()` once per result, identical to how
43
+ `quick_scrape_stream()` works today.
44
+
45
+ ---
46
+
47
+ ## 2. Frontend-Breaking Output Schema Changes
48
+
49
+ The table below lists every field difference that will cause the frontend to break
50
+ or silently lose data. Fields the frontend never uses (new additions) are not listed.
51
+
52
+ | Old field | Status in new `ScrapeResult` | Action needed |
53
+ |---|---|---|
54
+ | `status` (`"success"` / `"error"` string) | Renamed to `success` (bool) | **React team must update**: change all `result.status === "success"` checks to `result.success === true` |
55
+ | `error` (error string or null) | Renamed to `failure_reason` | **React team must update**: rename `result.error` → `result.failure_reason` |
56
+ | `scraped_at` (ISO datetime string) | **Missing** | Add `scraped_at: str` to `ScrapeResult` and set it to `datetime.now(timezone.utc).isoformat()` inside `_build_result_from_response()` |
57
+ | `hashes` (list of strings) | Present, but now a `dict` with keys `minhash`, `simhash`, `outline_simhash` | **React team must update** hashes display. Old frontend typed this as `string[]` — it is now `{ minhash: number[], simhash: number, outline_simhash: number }`. This is an improvement; just needs a frontend update. |
58
+ | `overview.char_count_formatted` | **Missing from overview dict** | Add `char_count_formatted` to the overview dict built inside `element_extractor.py`. This was the raw text length (with formatting markers). |
59
+ | `overview.page_title` | Moved to top-level `title` field | The `overview` dict still has `page_title` via the pipeline, so this should already be present inside `overview`. Confirm `overview["page_title"]` is still populated. If so, **no action needed** — the old frontend read from `overview.page_title`, not the flat `title`. |
60
+ | `overview.outline` | Moved to top-level `document_outline` | Same as above — confirm `overview["outline"]` is still populated by the pipeline. The flat `document_outline` is additional, not a replacement. |
61
+ | `metadata` (was inside `overview`) | Moved to top-level `metadata` | Old frontend read from `overview.metadata`. The `overview` dict in the new pipeline still carries `metadata` inside it (via `element_extractor`). The new flat `result.metadata` is an extra top-level copy. **No action needed** as long as `overview["metadata"]` is still present. Verify this. |
62
+
63
+ ### Envelope shape
64
+
65
+ The service adapter must still emit the exact same SSE envelope the frontend expects:
66
+
67
+ ```json
68
+ {
69
+ "response_type": "fetch_results",
70
+ "metadata": { "execution_time_ms": 123.4 },
71
+ "results": [ ...per-page results... ]
72
+ }
73
+ ```
74
+
75
+ `scrape_many()` returns a bare `List[ScrapeResult]` with no envelope. The adapter
76
+ layer is responsible for wrapping it. This is not a gap in the package itself — it
77
+ belongs in the `ScrapeService` adapter noted in `MIGRATION_STATUS.md`.
78
+
79
+ ---
80
+
81
+ ## 3. Per-Field Output Selection (Boolean Flags)
82
+
83
+ **Severity: MEDIUM — no crash, but wastes bandwidth**
84
+
85
+ The old system respects boolean flags from the request (`get_overview`,
86
+ `get_structured_data`, `get_organized_data`, `get_links`, `get_text_data`,
87
+ `get_main_image`, `get_content_filter_removal_details`) and omits fields that are
88
+ `False`. The new `ScrapeResult.to_dict()` emits every non-None field unconditionally.
89
+
90
+ The `ScrapeService` adapter should apply the flags as a post-processing filter:
91
+
92
+ ```python
93
+ def apply_field_flags(result: dict, options: ScrapeOptionsBase) -> dict:
94
+ if not options.get_overview:
95
+ result.pop("overview", None)
96
+ if not options.get_organized_data:
97
+ result.pop("organized_data", None)
98
+ # ... etc.
99
+ return result
100
+ ```
101
+
102
+ This does not need to be in the `matrx_scraper` package — it belongs in the adapter.
103
+
104
+ ---
105
+
106
+ ## 4. Per-Domain Proxy / Permission Gating
107
+
108
+ **Severity: LOW for now — the feature existed but was rarely triggered**
109
+
110
+ The old `QuickScrapeManager` checked three DB tables before scraping each URL:
111
+
112
+ 1. `scrape_domain.scrape_allowed` — global on/off per domain
113
+ 2. `scrape_domain_quick_scrape_settings.enabled` — quick-scrape-specific on/off
114
+ 3. `scrape_domain_quick_scrape_settings.proxy_type` — which proxy tier to use
115
+
116
+ The new `scrape()` accepts only `use_proxy: bool` and `request_type: RequestType`.
117
+ There is no DB lookup; it's a binary on/off.
118
+
119
+ **What to build (when needed):** A lightweight domain-config resolver that reads
120
+ those same DB tables and translates them into a `use_proxy` bool and
121
+ `request_type` value before calling `scrape()`. This can live in the adapter layer,
122
+ not in the package itself.
123
+
124
+ **For now:** Default to `use_proxy=True` in the adapter. This matches the old
125
+ default behavior for domains with no explicit settings.
126
+
127
+ ---
128
+
129
+ ## 5. `scraped_at` Field Missing from `ScrapeResult`
130
+
131
+ This is also called out in item 2 above, but isolated here for clarity because it
132
+ requires a code change inside the package.
133
+
134
+ **File:** `packages/matrx-scraper/matrx_scraper/orchestrator.py`
135
+
136
+ **Fix:** Add `scraped_at: Optional[str] = None` to `ScrapeResult` and set it in
137
+ `_build_result_from_response()`:
138
+
139
+ ```python
140
+ from datetime import datetime, timezone
141
+
142
+ # inside _build_result_from_response(), after building `result`:
143
+ result.scraped_at = datetime.now(timezone.utc).isoformat()
144
+ ```
145
+
146
+ ---
147
+
148
+ ## Summary Checklist
149
+
150
+ ```
151
+ [x] Add scrape_many_stream() async generator to orchestrator.py (DONE)
152
+ [x] Add scraped_at field to ScrapeResult (DONE)
153
+ [x] Confirm overview["page_title"] still populated in pipeline output (CONFIRMED — core.py line ~165)
154
+ [x] Confirm overview["outline"] still populated in pipeline output (CONFIRMED — core.py line ~170)
155
+ [x] Confirm overview["metadata"] still populated in pipeline output (CONFIRMED — core.py line ~163)
156
+ [x] Add char_count_formatted back to overview dict (DONE — core.py)
157
+ [x] Build ScrapeService adapter (matrx_scraper/service.py) (DONE)
158
+ [x] ↳ adapter wraps scrape_many_stream(), emits per-result send_data()
159
+ [x] ↳ adapter builds fetch_results envelope with execution_time_ms
160
+ [x] ↳ adapter applies boolean field-flag filtering post-scrape
161
+ [x] ↳ adapter defaults proxy=True (DB lookup deferred — low priority)
162
+ [x] Router (aidream/api/routers/scraper.py) wired to matrx_scraper (DONE)
163
+ [ ] Notify React team:
164
+ [ ] ↳ result.status ("success"/"error") → result.success (bool)
165
+ [ ] ↳ result.error (string) → result.failure_reason (string)
166
+ [ ] ↳ result.hashes type changed: string[] → { minhash, simhash, outline_simhash }
167
+ ```
168
+
169
+ ---
170
+
171
+ ## 6. Domain-Config SQL Has No Canonical Schema
172
+
173
+ **Severity: LOW (no crash, but silently broken)**
174
+
175
+ `PostgresDomainConfigStore._load_all_domains()` ships hand-written SQL against
176
+ four tables — `scrape_domain`, `scrape_domain_settings`, `scrape_path_pattern`,
177
+ `scrape_path_override` — but the package does not own any DDL for those tables.
178
+ The column names are inferred from whatever the host's migrations happened to
179
+ create. We hit this in the wild on 2026-05-01: the package SQL referenced
180
+ `scrape_path_pattern.domain_id` and `scrape_path_pattern.pattern`, while the
181
+ actual columns are `scrape_domain_id` and `path_pattern`. The `_refresh()`
182
+ exception was caught and logged, so domain configs silently fell back to empty
183
+ and proxy/permission rules never engaged — this is the exact silent-degradation
184
+ mode the platform principle ("aggressively detected, not just patched") tells
185
+ us to eliminate.
186
+
187
+ **Two complementary platform fixes:**
188
+
189
+ 1. **Ship the DDL.** Add `matrx_scraper/sql/domain_config.sql` (or a tiny
190
+ migration set) that creates the four tables with the column names the
191
+ package's queries actually use. A standalone consumer runs it once; aidream
192
+ is a pre-existing host whose migrations already created compatible tables.
193
+ 2. **Pre-flight schema check.** On `PostgresDomainConfigStore.start()`, run a
194
+ `LIMIT 0` of each query inside a single connection. If any column is
195
+ missing, raise `RuntimeError` with the exact column-vs-table that doesn't
196
+ match — fail loudly at startup instead of silently caching `{}` for the
197
+ life of the process.
198
+
199
+ The hot-fix on 2026-05-01 just renamed the columns in-query (using SQL
200
+ aliases so the Python `row[...]` access is unchanged). That removes the
201
+ immediate failure but leaves the same class of failure open for the next schema drift.
202
+
203
+ ---
204
+
205
+ ## ✅ MCP Tools & Research — Migrated (done separately)
206
+
207
+ All active callers of the old `scraper_enhanced` parser have been switched to
208
+ `matrx_scraper`. The `ai_research_with_images` field and all concurrent patterns
209
+ are unchanged — only the import source changed.
210
+
211
+ | File | Change |
212
+ |---|---|
213
+ | `scraper/scraper_enhanced/features/read_page.py` | `scraper_enhanced.{scraper,parser.parser,content_extractors}` → `matrx_scraper.{scraper,parser.core,extractors}` |
214
+ | `scraper/scraper_enhanced/features/mcp_tool_helpers.py` | same |
215
+ | `scraper/scraper_enhanced/features/top_n_brave_results.py` | same |
216
+ | `research/multisource.py` | `scraper_enhanced.parser.parser.parse_html` → `matrx_scraper.parser.core.parse_html` |
217
+ | `research/scraper.py` | already on `matrx_scraper` — no change needed |
218
+
219
+ See `LEGACY_AUDIT.md` for the full inventory of remaining old-scraper references
220
+ (deprecated Socket.IO stack + dead scripts).