matrx-scraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrx_scraper-0.1.0/.env.example +34 -0
- matrx_scraper-0.1.0/.gitignore +257 -0
- matrx_scraper-0.1.0/CLAUDE.md +104 -0
- matrx_scraper-0.1.0/Dockerfile +50 -0
- matrx_scraper-0.1.0/GAPS_TO_FIX.md +220 -0
- matrx_scraper-0.1.0/LEGACY_AUDIT.md +115 -0
- matrx_scraper-0.1.0/MIGRATION_GUIDE.md +366 -0
- matrx_scraper-0.1.0/MIGRATION_STATUS.md +159 -0
- matrx_scraper-0.1.0/PKG-INFO +179 -0
- matrx_scraper-0.1.0/README.md +104 -0
- matrx_scraper-0.1.0/SCHEMA.md +616 -0
- matrx_scraper-0.1.0/SCRAPER_SERVICE_GAPS.md +208 -0
- matrx_scraper-0.1.0/STANDALONE_USAGE.md +626 -0
- matrx_scraper-0.1.0/docker-compose.yml +40 -0
- matrx_scraper-0.1.0/matrx_scraper/__init__.py +232 -0
- matrx_scraper-0.1.0/matrx_scraper/_ext.py +43 -0
- matrx_scraper-0.1.0/matrx_scraper/ai_browser/__init__.py +112 -0
- matrx_scraper-0.1.0/matrx_scraper/ai_browser/actions.py +573 -0
- matrx_scraper-0.1.0/matrx_scraper/ai_browser/client.py +438 -0
- matrx_scraper-0.1.0/matrx_scraper/ai_browser/session.py +193 -0
- matrx_scraper-0.1.0/matrx_scraper/ai_tools/__init__.py +44 -0
- matrx_scraper-0.1.0/matrx_scraper/ai_tools/specs.py +575 -0
- matrx_scraper-0.1.0/matrx_scraper/api/__init__.py +26 -0
- matrx_scraper-0.1.0/matrx_scraper/api/browser_router.py +326 -0
- matrx_scraper-0.1.0/matrx_scraper/api/ext_router.py +322 -0
- matrx_scraper-0.1.0/matrx_scraper/api/preview_router.py +29 -0
- matrx_scraper-0.1.0/matrx_scraper/api/scrape_router.py +224 -0
- matrx_scraper-0.1.0/matrx_scraper/browser_pool.py +218 -0
- matrx_scraper-0.1.0/matrx_scraper/cache.py +164 -0
- matrx_scraper-0.1.0/matrx_scraper/crawler.py +1051 -0
- matrx_scraper-0.1.0/matrx_scraper/custom_extractors.py +196 -0
- matrx_scraper-0.1.0/matrx_scraper/domain_config.py +315 -0
- matrx_scraper-0.1.0/matrx_scraper/events.py +234 -0
- matrx_scraper-0.1.0/matrx_scraper/extractors.py +73 -0
- matrx_scraper-0.1.0/matrx_scraper/features/__init__.py +15 -0
- matrx_scraper-0.1.0/matrx_scraper/features/extensions.py +61 -0
- matrx_scraper-0.1.0/matrx_scraper/features/mcp_tool_helpers.py +175 -0
- matrx_scraper-0.1.0/matrx_scraper/features/quick_search.py +48 -0
- matrx_scraper-0.1.0/matrx_scraper/features/read_page.py +189 -0
- matrx_scraper-0.1.0/matrx_scraper/features/utils.py +36 -0
- matrx_scraper-0.1.0/matrx_scraper/graph_nodes/__init__.py +44 -0
- matrx_scraper-0.1.0/matrx_scraper/graph_nodes/scrape_actions.py +252 -0
- matrx_scraper-0.1.0/matrx_scraper/graph_nodes/stock_image_actions.py +218 -0
- matrx_scraper-0.1.0/matrx_scraper/gsc_bootstrap.py +262 -0
- matrx_scraper-0.1.0/matrx_scraper/mcp/__init__.py +35 -0
- matrx_scraper-0.1.0/matrx_scraper/mcp/__main__.py +6 -0
- matrx_scraper-0.1.0/matrx_scraper/mcp/server.py +257 -0
- matrx_scraper-0.1.0/matrx_scraper/orchestrator.py +383 -0
- matrx_scraper-0.1.0/matrx_scraper/pagerank.py +104 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/__init__.py +22 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/core.py +386 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/data_types.py +631 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/element_extractor.py +845 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/extraction_rules.py +67 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/flattener.py +441 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/hashing.py +89 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/link_extractor.py +165 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/main_content.py +76 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/noise_config.py +133 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/noise_remover.py +163 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/overrides.py +188 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/scrape_filter.py +229 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/scrape_json_to_text.py +165 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/transform.py +297 -0
- matrx_scraper-0.1.0/matrx_scraper/parser/utils.py +409 -0
- matrx_scraper-0.1.0/matrx_scraper/performance.py +643 -0
- matrx_scraper-0.1.0/matrx_scraper/preview.py +195 -0
- matrx_scraper-0.1.0/matrx_scraper/queue_backend.py +114 -0
- matrx_scraper-0.1.0/matrx_scraper/rate_limiter.py +114 -0
- matrx_scraper-0.1.0/matrx_scraper/recipe_runtime.py +140 -0
- matrx_scraper-0.1.0/matrx_scraper/recipes.py +141 -0
- matrx_scraper-0.1.0/matrx_scraper/scraper.py +788 -0
- matrx_scraper-0.1.0/matrx_scraper/search/__init__.py +18 -0
- matrx_scraper-0.1.0/matrx_scraper/search/brave_client.py +114 -0
- matrx_scraper-0.1.0/matrx_scraper/search/rate_limiter.py +27 -0
- matrx_scraper-0.1.0/matrx_scraper/search/search.py +188 -0
- matrx_scraper-0.1.0/matrx_scraper/seo_audit.py +413 -0
- matrx_scraper-0.1.0/matrx_scraper/server/__init__.py +19 -0
- matrx_scraper-0.1.0/matrx_scraper/server/__main__.py +57 -0
- matrx_scraper-0.1.0/matrx_scraper/server/app.py +181 -0
- matrx_scraper-0.1.0/matrx_scraper/server/config.py +64 -0
- matrx_scraper-0.1.0/matrx_scraper/service.py +424 -0
- matrx_scraper-0.1.0/matrx_scraper/url_utils.py +30 -0
- matrx_scraper-0.1.0/matrx_scraper/utils/__init__.py +8 -0
- matrx_scraper-0.1.0/matrx_scraper/utils/url.py +239 -0
- matrx_scraper-0.1.0/pyproject.toml +95 -0
- matrx_scraper-0.1.0/scripts/release.sh +5 -0
- matrx_scraper-0.1.0/tests/test_crawler.py +176 -0
- matrx_scraper-0.1.0/tests/test_stock_image_actions.py +93 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# Matrx Scraper — Standalone Server Configuration
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Copy this file to .env and fill in the values.
|
|
5
|
+
|
|
6
|
+
# --- Required ---
|
|
7
|
+
DATABASE_URL=postgresql://user:password@host:5432/dbname
|
|
8
|
+
SUPABASE_JWT_SECRET=your-supabase-jwt-secret
|
|
9
|
+
|
|
10
|
+
# --- Optional auth ---
|
|
11
|
+
# ADMIN_API_TOKEN is accepted by AuthMiddleware but the static-token escape
|
|
12
|
+
# hatch was removed in 2026-05 — it is ignored. Leave unset.
|
|
13
|
+
ADMIN_API_TOKEN=
|
|
14
|
+
|
|
15
|
+
# --- Search ---
|
|
16
|
+
BRAVE_API_KEY=
|
|
17
|
+
|
|
18
|
+
# --- Proxy ---
|
|
19
|
+
PROXY_DATACENTER_URL=
|
|
20
|
+
PROXY_RESIDENTIAL_URL=
|
|
21
|
+
|
|
22
|
+
# --- Browser pool ---
|
|
23
|
+
BROWSER_POOL_SIZE=3
|
|
24
|
+
ENABLE_BROWSER_POOL=true
|
|
25
|
+
|
|
26
|
+
# --- Feature toggles ---
|
|
27
|
+
ENABLE_CACHE=true
|
|
28
|
+
ENABLE_DOMAIN_CONFIG=true
|
|
29
|
+
|
|
30
|
+
# --- Server ---
|
|
31
|
+
HOST=0.0.0.0
|
|
32
|
+
PORT=8000
|
|
33
|
+
WORKERS=1
|
|
34
|
+
LOG_LEVEL=info
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
*.pyc
|
|
2
|
+
secrets/
|
|
3
|
+
ignore/
|
|
4
|
+
temp/
|
|
5
|
+
logs/
|
|
6
|
+
todo
|
|
7
|
+
text_notes/
|
|
8
|
+
aidream/secrets/2.env
|
|
9
|
+
automation_matrix/matrix_processing/temp/*
|
|
10
|
+
cd
|
|
11
|
+
# Byte-compiled / optimized / DLL files
|
|
12
|
+
__pycache__/
|
|
13
|
+
*.py[cod]
|
|
14
|
+
*$py.class
|
|
15
|
+
|
|
16
|
+
# C extensions
|
|
17
|
+
*.so
|
|
18
|
+
.venv/
|
|
19
|
+
|
|
20
|
+
# Distribution / packaging
|
|
21
|
+
.Python
|
|
22
|
+
build/
|
|
23
|
+
develop-eggs/
|
|
24
|
+
dist/
|
|
25
|
+
downloads/
|
|
26
|
+
eggs/
|
|
27
|
+
.eggs/
|
|
28
|
+
lib/
|
|
29
|
+
lib64/
|
|
30
|
+
# The blanket lib/ rule above is from the standard Python .gitignore template
|
|
31
|
+
# and was silently swallowing TS source under the SPA `src/lib/` folders.
|
|
32
|
+
# Re-allow them explicitly so frontend builds don't ship without their lib layer.
|
|
33
|
+
!dashboard/src/lib/
|
|
34
|
+
!dashboard/src/lib/**
|
|
35
|
+
!workflow-studio/src/lib/
|
|
36
|
+
!workflow-studio/src/lib/**
|
|
37
|
+
parts/
|
|
38
|
+
sdist/
|
|
39
|
+
var/
|
|
40
|
+
wheels/
|
|
41
|
+
share/python-wheels/
|
|
42
|
+
*.egg-info/
|
|
43
|
+
.installed.cfg
|
|
44
|
+
*.egg
|
|
45
|
+
MANIFEST
|
|
46
|
+
|
|
47
|
+
# PyInstaller
|
|
48
|
+
# Usually these files are written by a python script from a template
|
|
49
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
50
|
+
*.manifest
|
|
51
|
+
*.spec
|
|
52
|
+
|
|
53
|
+
# Installer logs
|
|
54
|
+
pip-log.txt
|
|
55
|
+
pip-delete-this-directory.txt
|
|
56
|
+
|
|
57
|
+
# Unit test / coverage reports
|
|
58
|
+
ai/tests/clean_response.json
|
|
59
|
+
ai/tests/cx_storage_response.json
|
|
60
|
+
ai/tests/execution_test.py
|
|
61
|
+
ai/tests/final_response.json
|
|
62
|
+
htmlcov/
|
|
63
|
+
.tox/
|
|
64
|
+
.nox/
|
|
65
|
+
.coverage
|
|
66
|
+
.coverage.*
|
|
67
|
+
.cache
|
|
68
|
+
nosetests.xml
|
|
69
|
+
coverage.xml
|
|
70
|
+
*.cover
|
|
71
|
+
*.py,cover
|
|
72
|
+
.hypothesis/
|
|
73
|
+
.pytest_cache/
|
|
74
|
+
cover/
|
|
75
|
+
|
|
76
|
+
# Translations
|
|
77
|
+
*.mo
|
|
78
|
+
*.pot
|
|
79
|
+
|
|
80
|
+
# Django stuff:
|
|
81
|
+
*.log
|
|
82
|
+
local_settings.py
|
|
83
|
+
db.sqlite3
|
|
84
|
+
db.sqlite3-journal
|
|
85
|
+
|
|
86
|
+
# Flask stuff:
|
|
87
|
+
instance/
|
|
88
|
+
.webassets-cache
|
|
89
|
+
|
|
90
|
+
# Scrapy stuff:
|
|
91
|
+
.scrapy
|
|
92
|
+
|
|
93
|
+
# Sphinx documentation
|
|
94
|
+
docs/_build/
|
|
95
|
+
|
|
96
|
+
# PyBuilder
|
|
97
|
+
.pybuilder/
|
|
98
|
+
target/
|
|
99
|
+
|
|
100
|
+
# Jupyter Notebook
|
|
101
|
+
.ipynb_checkpoints
|
|
102
|
+
|
|
103
|
+
# IPython
|
|
104
|
+
profile_default/
|
|
105
|
+
ipython_config.py
|
|
106
|
+
|
|
107
|
+
# pyenv
|
|
108
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
109
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
110
|
+
# .python-version
|
|
111
|
+
|
|
112
|
+
# pipenv
|
|
113
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
114
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
115
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
116
|
+
# install all needed dependencies.
|
|
117
|
+
#Pipfile.lock
|
|
118
|
+
|
|
119
|
+
# poetry
|
|
120
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
121
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
122
|
+
# commonly ignored for libraries.
|
|
123
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
124
|
+
|
|
125
|
+
# pdm
|
|
126
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
127
|
+
#pdm.lock
|
|
128
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
129
|
+
# in version control.
|
|
130
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
131
|
+
.pdm.toml
|
|
132
|
+
|
|
133
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
134
|
+
__pypackages__/
|
|
135
|
+
|
|
136
|
+
# Celery stuff
|
|
137
|
+
celerybeat-schedule
|
|
138
|
+
celerybeat.pid
|
|
139
|
+
|
|
140
|
+
# SageMath parsed files
|
|
141
|
+
*.sage.py
|
|
142
|
+
|
|
143
|
+
# Environments
|
|
144
|
+
.env
|
|
145
|
+
.env_remote
|
|
146
|
+
.venv
|
|
147
|
+
env/
|
|
148
|
+
venv/
|
|
149
|
+
ENV/
|
|
150
|
+
env.bak/
|
|
151
|
+
venv.bak/
|
|
152
|
+
.env.armanonly
|
|
153
|
+
|
|
154
|
+
# Spyder project settings
|
|
155
|
+
.spyderproject
|
|
156
|
+
.spyproject
|
|
157
|
+
|
|
158
|
+
# Rope project settings
|
|
159
|
+
.ropeproject
|
|
160
|
+
|
|
161
|
+
# mkdocs documentation
|
|
162
|
+
/site
|
|
163
|
+
|
|
164
|
+
# mypy
|
|
165
|
+
.mypy_cache/
|
|
166
|
+
.dmypy.json
|
|
167
|
+
dmypy.json
|
|
168
|
+
|
|
169
|
+
# Pyre type checker
|
|
170
|
+
.pyre/
|
|
171
|
+
|
|
172
|
+
# random armani files
|
|
173
|
+
/armani_dev/secrets/
|
|
174
|
+
/armani/
|
|
175
|
+
/_armani/
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# pytype static type analyzer
|
|
180
|
+
.pytype/
|
|
181
|
+
|
|
182
|
+
# Cython debug symbols
|
|
183
|
+
cython_debug/
|
|
184
|
+
|
|
185
|
+
.idea/
|
|
186
|
+
.vscode/
|
|
187
|
+
/node_modules/
|
|
188
|
+
|
|
189
|
+
dump.rdb
|
|
190
|
+
|
|
191
|
+
frontend/
|
|
192
|
+
|
|
193
|
+
# AME Temp Files and directory structure
|
|
194
|
+
# Ignore all files in the temp directory and its subdirectories
|
|
195
|
+
/temp/**/*
|
|
196
|
+
/tmp/**/*
|
|
197
|
+
|
|
198
|
+
# Allow .gitkeep files to retain directory structure
|
|
199
|
+
!/temp/**/.gitkeep
|
|
200
|
+
!/tmp/**/.gitkeep
|
|
201
|
+
|
|
202
|
+
# Armani
|
|
203
|
+
.history*
|
|
204
|
+
.history/
|
|
205
|
+
local_data/
|
|
206
|
+
local_reports_data/
|
|
207
|
+
webscraper/quick_scrapes/temp/
|
|
208
|
+
automation_matrix/ai_apis/fireworks/_dev/*
|
|
209
|
+
automation_matrix/ai_apis/fireworks/_dev/fireworks_sample.py
|
|
210
|
+
*.pdf
|
|
211
|
+
*.flac
|
|
212
|
+
*.mp3
|
|
213
|
+
*.wav
|
|
214
|
+
miniconda.sh
|
|
215
|
+
/database/python_sql/temp_data/
|
|
216
|
+
.history*
|
|
217
|
+
.history/
|
|
218
|
+
.history/
|
|
219
|
+
|
|
220
|
+
_dev/
|
|
221
|
+
/_dev/
|
|
222
|
+
requirements_filtered.txt
|
|
223
|
+
|
|
224
|
+
# matrx-dev-tools backups
|
|
225
|
+
.env-backups/
|
|
226
|
+
# Matrx Ship config (contains API key)
|
|
227
|
+
.matrx-ship.json
|
|
228
|
+
|
|
229
|
+
# Matrx config (contains API keys)
|
|
230
|
+
.matrx.json
|
|
231
|
+
.matrx-tools.conf
|
|
232
|
+
|
|
233
|
+
# Claude Code local worktrees and per-user settings
|
|
234
|
+
.claude/worktrees/
|
|
235
|
+
.claude/settings.local.json
|
|
236
|
+
|
|
237
|
+
# Append-only snapshots from matrx_utils.update_history (unbounded; do not commit)
|
|
238
|
+
common/utils/data_in_code/data_history.json
|
|
239
|
+
packages/matrx-utils/matrx_utils/data_in_code/data_history.json
|
|
240
|
+
|
|
241
|
+
# Tool-dispatch debug logs — one file per server start, never committed
|
|
242
|
+
.matrx-debug/
|
|
243
|
+
|
|
244
|
+
# macOS Finder metadata
|
|
245
|
+
.DS_Store
|
|
246
|
+
**/.DS_Store
|
|
247
|
+
|
|
248
|
+
# Environment files
|
|
249
|
+
.env
|
|
250
|
+
.env.*
|
|
251
|
+
*.env
|
|
252
|
+
*.env.*
|
|
253
|
+
|
|
254
|
+
# Keep safe templates trackable
|
|
255
|
+
!.env.example
|
|
256
|
+
!.env.sample
|
|
257
|
+
!.env.template
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# CLAUDE.md — matrx-scraper
|
|
2
|
+
|
|
3
|
+
> **Operating Principle: Build the platform, not the artifact.** Every task is a probe that exposes a missing capability — build it, then consume it. Code that only serves one artifact is forbidden. Full doctrine: [/PRINCIPLES.md](../../PRINCIPLES.md).
|
|
4
|
+
|
|
5
|
+
**Package:** `matrx-scraper` (PyPI) — Python 3.12+ — currently v0.1.0 (alpha)
|
|
6
|
+
**Role in the graph:** Tier 2. Depends on `matrx-utils`. Optionally depends on `matrx-connect` (for streaming/events integration) and `matrx-orm` (for the Postgres-backed domain-config backend). Used by the aidream app and potentially by matrx-graph nodes.
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## Read this first
|
|
11
|
+
|
|
12
|
+
matrx-scraper is the canonical web scraping + HTML parsing + site crawling + search + rendering pipeline for the Matrx family. It **replaces** the legacy root-level `scraper/` directory and parts of `research/`. Do not add new scraping logic to those folders.
|
|
13
|
+
|
|
14
|
+
This package is also a strong example of optional-sibling integration: it works standalone (just `pip install matrx-scraper`), degrades gracefully when matrx-connect isn't present, and uses an `_ext.py` registry for host-provided hooks.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## What this package provides
|
|
19
|
+
|
|
20
|
+
- **Scraping** (`matrx_scraper.scraper`, `matrx_scraper.orchestrator`): `scrape(url, **opts)`, `scrape_many(urls, ...)`, `scrape_many_stream(...)`, `ScrapeResult`, `ScrapeOptions`, `ScrapeService`.
|
|
21
|
+
- **Parsing pipeline** (`matrx_scraper.parser`): 8-stage HTML pipeline — normalize → `NoiseRemover` → `ScrapeFilter` → `ElementExtractor` → `LinkExtractor` → metadata (extruct) → hashing (MinHash/SimHash) → markdownify. Entry point: `parse_html(html, **opts)` and `ParserOrchestrator`.
|
|
22
|
+
- **Crawling** (`matrx_scraper.crawler`): `crawl_site(base_url)`, `SiteCrawler` — async BFS traversal, robots.txt-aware.
|
|
23
|
+
- **Search** (`matrx_scraper.search`): `BraveSearchClient`.
|
|
24
|
+
- **Caching** (`matrx_scraper.cache`): `CacheBackend` with `MemoryCache`, `TwoTierCache`.
|
|
25
|
+
- **Per-URL / per-domain config** (`matrx_scraper.domain_config`): `DomainConfigBackend`. Default is static; a Postgres-backed backend is available behind an optional `matrx-orm` extra.
|
|
26
|
+
- **Browser automation** (optional): `PlaywrightBrowserPool` behind the `[browser]` extra.
|
|
27
|
+
- **FastAPI server** (optional, `[server]` extra): `matrx-scraper` CLI / `server/__main__.py`, plus routers under `api/`.
|
|
28
|
+
|
|
29
|
+
Public API is the ~20 symbols in `matrx_scraper/__init__.py`. Keep stable.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## The `_ext` / `configure_ext` injection pattern
|
|
34
|
+
|
|
35
|
+
Because matrx-scraper wants to integrate with matrx-connect's event system when present, but also run standalone, it uses the `_ext` registry pattern:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
# Host wires it at startup, passing matrx-connect types in:
|
|
39
|
+
import matrx_scraper
|
|
40
|
+
from matrx_connect.context.events import InfoPayload, WarningPayload
|
|
41
|
+
from matrx_connect.context.data_types import …
|
|
42
|
+
|
|
43
|
+
matrx_scraper.configure_ext(
|
|
44
|
+
info_payload_cls=InfoPayload,
|
|
45
|
+
warning_payload_cls=WarningPayload,
|
|
46
|
+
# …
|
|
47
|
+
)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Inside this package:
|
|
51
|
+
|
|
52
|
+
- All matrx-connect imports are **conditional** — wrapped in `has_ext("connect")` checks or lazy imports behind functions.
|
|
53
|
+
- `ScrapeService` (the FastAPI adapter in `service.py`) accepts an injected `AppContext` shape; it does not import matrx-connect eagerly.
|
|
54
|
+
- Cache / domain-config / browser-pool backends are all injectable — pass them to the orchestrator or leave as defaults.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Dependency rules specific to this package
|
|
59
|
+
|
|
60
|
+
- ✅ `from matrx_utils import …` — declared hard dep.
|
|
61
|
+
- ✅ `from matrx_connect import …` — **only** inside conditionally-imported code paths, and only when `has_ext("connect")` confirms it was configured. Never at module top level of a core scraping file.
|
|
62
|
+
- ✅ `from matrx_orm import …` — **only** inside the Postgres domain-config backend module, behind the `[postgres]` extra.
|
|
63
|
+
- ❌ No `from matrx_ai import …` (matrx-ai depends on graph, not scraper; keep it that way).
|
|
64
|
+
- ❌ No `from matrx_graph import …` at module top level. If a scraping step wants to run a graph, the host composes them; matrx-scraper doesn't orchestrate graphs internally.
|
|
65
|
+
- ❌ No `from aidream import …`. No imports from root `scraper/`, `research/`, `common/`, `config/`, `api_management/`.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Python standards (same as root)
|
|
70
|
+
|
|
71
|
+
- Full type hints. Parser/scraper result types are public contracts — Pydantic.
|
|
72
|
+
- No docstrings except on the public API (`scrape`, `parse_html`, `ScrapeResult`, `ScrapeService`, `crawl_site`), and keep each of those to one line.
|
|
73
|
+
- Hot paths: HTML parsing (selectolax over bs4 where both are available), noise removal, link classification. Avoid repeated DOM traversals; do one pass and reuse structures.
|
|
74
|
+
- Explicit exception handling — every network op has retry + timeout, and errors surface through the emitter when one is available.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Testing this package in isolation
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
uv run pytest packages/matrx-scraper/tests
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Tests must run with only matrx-utils installed. Browser + Postgres-backed tests live behind extras and skip when the extras aren't installed.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Relationship to legacy `scraper/` and `research/`
|
|
89
|
+
|
|
90
|
+
Both are being **migrated out**. The internal docs in this package (`MIGRATION_STATUS.md`, `GAPS_TO_FIX.md`, `LEGACY_AUDIT.md`, `MIGRATION_GUIDE.md`) track specifics. High-level rules:
|
|
91
|
+
|
|
92
|
+
- Consumers in `research/` should import from `matrx_scraper`, not from the old `scraper/` directory.
|
|
93
|
+
- Features still missing from matrx-scraper (MCP tool adapters, auto-clicker, shadow-DOM walker, stateful browser sessions, disk cache, StackBlitz preprocessor, etc.) should be ported into this package, not extended inside `scraper/`.
|
|
94
|
+
- When you port a feature, delete the corresponding code in `scraper/` after confirming no remaining consumers.
|
|
95
|
+
|
|
96
|
+
See the package-internal migration docs **and** root `PACKAGES_MIGRATION_PLAN.md`.
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Known issues
|
|
101
|
+
|
|
102
|
+
- Some advanced features listed in `MIGRATION_STATUS.md` are not yet ported.
|
|
103
|
+
- `scrape_many_stream` was added late; consumers may still be using buffered `scrape_many`. Prefer the streaming path.
|
|
104
|
+
- This package's code itself has no known aidream-coupling violations.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Standalone scraper microservice.
|
|
2
|
+
# Build context: monorepo root (so we can access pyproject.toml + uv.lock + sibling workspace packages).
|
|
3
|
+
# Coolify config: base_directory=/, dockerfile_location=/packages/matrx-scraper/Dockerfile.
|
|
4
|
+
|
|
5
|
+
FROM python:3.13-slim AS base
|
|
6
|
+
|
|
7
|
+
ENV PYTHONUNBUFFERED=1 \
|
|
8
|
+
PYTHONDONTWRITEBYTECODE=1 \
|
|
9
|
+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
|
10
|
+
UV_LINK_MODE=copy \
|
|
11
|
+
UV_COMPILE_BYTECODE=1
|
|
12
|
+
|
|
13
|
+
# System deps: curl for healthcheck. Playwright browser deps are installed
|
|
14
|
+
# later by `playwright install --with-deps`.
|
|
15
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
16
|
+
curl \
|
|
17
|
+
ca-certificates \
|
|
18
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
19
|
+
|
|
20
|
+
RUN pip install --no-cache-dir uv
|
|
21
|
+
|
|
22
|
+
WORKDIR /app
|
|
23
|
+
|
|
24
|
+
# uv sync needs the workspace root pyproject.toml + uv.lock and ALL workspace
|
|
25
|
+
# members listed in [tool.uv.workspace] members. We copy the whole packages/
|
|
26
|
+
# directory (~10 MB) — partial copies break workspace resolution.
|
|
27
|
+
COPY pyproject.toml uv.lock ./
|
|
28
|
+
COPY packages/ ./packages/
|
|
29
|
+
|
|
30
|
+
# Install matrx-scraper plus its [server] extras, using the frozen uv.lock
|
|
31
|
+
# from the workspace root. `--package matrx-scraper` focuses sync on this
|
|
32
|
+
# workspace member only — the heavy aidream-current root project is NOT
|
|
33
|
+
# installed. --no-dev skips dev-only tooling.
|
|
34
|
+
RUN uv sync --frozen --no-dev --package matrx-scraper --extra server
|
|
35
|
+
|
|
36
|
+
# Playwright Chromium + its OS deps. Largest layer; isolated for caching.
|
|
37
|
+
RUN uv run --no-sync playwright install --with-deps chromium
|
|
38
|
+
|
|
39
|
+
# Default to 8001 to match the existing Coolify Traefik labels and avoid
|
|
40
|
+
# proxy reconfiguration. ServerConfig.from_env() respects PORT.
|
|
41
|
+
ENV PORT=8001 \
|
|
42
|
+
HOST=0.0.0.0
|
|
43
|
+
|
|
44
|
+
EXPOSE 8001
|
|
45
|
+
|
|
46
|
+
# /health/ready confirms DB pool + cache are wired before traffic flows.
|
|
47
|
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
|
48
|
+
CMD curl -fsS http://localhost:8001/health/ready >/dev/null || exit 1
|
|
49
|
+
|
|
50
|
+
CMD ["uv", "run", "--no-sync", "python", "-m", "matrx_scraper.server"]
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# Gaps to Fix Before Switching the Router to `matrx-scraper`
|
|
2
|
+
|
|
3
|
+
This document lists only the things we **lose** by switching the FastAPI router
|
|
4
|
+
(`aidream/api/routers/scraper.py`) from the old `scraper/services_v2/service.py`
|
|
5
|
+
to the new `matrx_scraper` package. It is a to-do list for the new package, not
|
|
6
|
+
a comparison of the two systems.
|
|
7
|
+
|
|
8
|
+
Improvements in the new system are not listed here — they are already done and we
|
|
9
|
+
keep them. Frontend breaking changes are flagged so the React team can be notified.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## 1. Streaming — `scrape_many()` Does Not Stream
|
|
14
|
+
|
|
15
|
+
**Severity: BLOCKER**
|
|
16
|
+
|
|
17
|
+
The old system yields each result to the frontend *as it finishes*, so the user
|
|
18
|
+
sees pages arrive one by one. The new `scrape_many()` uses `asyncio.gather` and
|
|
19
|
+
returns a batch only after every URL in the list completes.
|
|
20
|
+
|
|
21
|
+
**What needs to be added to `orchestrator.py`:**
|
|
22
|
+
|
|
23
|
+
An async-generator companion:
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
async def scrape_many_stream(
|
|
27
|
+
urls: List[str],
|
|
28
|
+
use_proxy: bool = True,
|
|
29
|
+
concurrency: int = 5,
|
|
30
|
+
) -> AsyncGenerator[ScrapeResult, None]:
|
|
31
|
+
semaphore = asyncio.Semaphore(concurrency)
|
|
32
|
+
|
|
33
|
+
async def _bounded(url: str) -> ScrapeResult:
|
|
34
|
+
async with semaphore:
|
|
35
|
+
return await scrape(url, use_proxy=use_proxy)
|
|
36
|
+
|
|
37
|
+
for coro in asyncio.as_completed([_bounded(u) for u in urls]):
|
|
38
|
+
yield await coro
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
The `ScrapeService` adapter (Priority 2 in `MIGRATION_STATUS.md`) must use this
|
|
42
|
+
generator and call `emitter.send_data()` once per result, identical to how
|
|
43
|
+
`quick_scrape_stream()` works today.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 2. Frontend-Breaking Output Schema Changes
|
|
48
|
+
|
|
49
|
+
The table below lists every field difference that will cause the frontend to break
|
|
50
|
+
or silently lose data. Fields the frontend never uses (new additions) are not listed.
|
|
51
|
+
|
|
52
|
+
| Old field | Status in new `ScrapeResult` | Action needed |
|
|
53
|
+
|---|---|---|
|
|
54
|
+
| `status` (`"success"` / `"error"` string) | Renamed to `success` (bool) | **React team must update**: change all `result.status === "success"` checks to `result.success === true` |
|
|
55
|
+
| `error` (error string or null) | Renamed to `failure_reason` | **React team must update**: rename `result.error` → `result.failure_reason` |
|
|
56
|
+
| `scraped_at` (ISO datetime string) | **Missing** | Add `scraped_at: Optional[str] = None` to `ScrapeResult` and set it to `datetime.now(timezone.utc).isoformat()` inside `_build_result_from_response()` |
|
|
57
|
+
| `hashes` (list of strings) | Present, but now a `dict` with keys `minhash`, `simhash`, `outline_simhash` | **React team must update** hashes display. Old frontend typed this as `string[]` — it is now `{ minhash: number[], simhash: number, outline_simhash: number }`. This is an improvement; just needs a frontend update. |
|
|
58
|
+
| `overview.char_count_formatted` | **Missing from overview dict** | Add `char_count_formatted` to the overview dict built inside `element_extractor.py`. This was the raw text length (with formatting markers). |
|
|
59
|
+
| `overview.page_title` | Moved to top-level `title` field | The `overview` dict still has `page_title` via the pipeline, so this should already be present inside `overview`. Confirm `overview["page_title"]` is still populated. If so, **no action needed** — the old frontend read from `overview.page_title`, not the flat `title`. |
|
|
60
|
+
| `overview.outline` | Moved to top-level `document_outline` | Same as above — confirm `overview["outline"]` is still populated by the pipeline. The flat `document_outline` is additional, not a replacement. |
|
|
61
|
+
| `metadata` (was inside `overview`) | Moved to top-level `metadata` | Old frontend read from `overview.metadata`. The `overview` dict in the new pipeline still carries `metadata` inside it (via `element_extractor`). The new flat `result.metadata` is an extra top-level copy. **No action needed** as long as `overview["metadata"]` is still present. Verify this. |
|
|
62
|
+
|
|
63
|
+
### Envelope shape
|
|
64
|
+
|
|
65
|
+
The service adapter must still emit the exact same SSE envelope the frontend expects:
|
|
66
|
+
|
|
67
|
+
```json
|
|
68
|
+
{
|
|
69
|
+
"response_type": "fetch_results",
|
|
70
|
+
"metadata": { "execution_time_ms": 123.4 },
|
|
71
|
+
"results": [ ...per-page results... ]
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
`scrape_many()` returns a bare `List[ScrapeResult]` with no envelope. The adapter
|
|
76
|
+
layer is responsible for wrapping it. This is not a gap in the package itself — it
|
|
77
|
+
belongs in the `ScrapeService` adapter noted in `MIGRATION_STATUS.md`.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## 3. Per-Field Output Selection (Boolean Flags)
|
|
82
|
+
|
|
83
|
+
**Severity: MEDIUM — no crash, but wastes bandwidth**
|
|
84
|
+
|
|
85
|
+
The old system respects boolean flags from the request (`get_overview`,
|
|
86
|
+
`get_structured_data`, `get_organized_data`, `get_links`, `get_text_data`,
|
|
87
|
+
`get_main_image`, `get_content_filter_removal_details`) and omits fields that are
|
|
88
|
+
`False`. The new `ScrapeResult.to_dict()` emits every non-None field unconditionally.
|
|
89
|
+
|
|
90
|
+
The `ScrapeService` adapter should apply the flags as a post-processing filter:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
def apply_field_flags(result: dict, options: ScrapeOptionsBase) -> dict:
|
|
94
|
+
if not options.get_overview:
|
|
95
|
+
result.pop("overview", None)
|
|
96
|
+
if not options.get_organized_data:
|
|
97
|
+
result.pop("organized_data", None)
|
|
98
|
+
# ... etc.
|
|
99
|
+
return result
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
This does not need to be in the `matrx_scraper` package — it belongs in the adapter.
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## 4. Per-Domain Proxy / Permission Gating
|
|
107
|
+
|
|
108
|
+
**Severity: LOW for now — the feature existed but was rarely triggered**
|
|
109
|
+
|
|
110
|
+
The old `QuickScrapeManager` checked three DB tables before scraping each URL:
|
|
111
|
+
|
|
112
|
+
1. `scrape_domain.scrape_allowed` — global on/off per domain
|
|
113
|
+
2. `scrape_domain_quick_scrape_settings.enabled` — quick-scrape-specific on/off
|
|
114
|
+
3. `scrape_domain_quick_scrape_settings.proxy_type` — which proxy tier to use
|
|
115
|
+
|
|
116
|
+
The new `scrape()` accepts only `use_proxy: bool` and `request_type: RequestType`.
|
|
117
|
+
There is no DB lookup; it's a binary on/off.
|
|
118
|
+
|
|
119
|
+
**What to build (when needed):** A lightweight domain-config resolver that reads
|
|
120
|
+
those same DB tables and translates them into a `use_proxy` bool and
|
|
121
|
+
`request_type` value before calling `scrape()`. This can live in the adapter layer,
|
|
122
|
+
not in the package itself.
|
|
123
|
+
|
|
124
|
+
**For now:** Default to `use_proxy=True` in the adapter. This matches the old
|
|
125
|
+
default behavior for domains with no explicit settings.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## 5. `scraped_at` Field Missing from `ScrapeResult`
|
|
130
|
+
|
|
131
|
+
This is also called out in item 2 above, but isolated here for clarity because it
|
|
132
|
+
requires a code change inside the package.
|
|
133
|
+
|
|
134
|
+
**File:** `packages/matrx-scraper/matrx_scraper/orchestrator.py`
|
|
135
|
+
|
|
136
|
+
**Fix:** Add `scraped_at: Optional[str] = None` to `ScrapeResult` and set it in
|
|
137
|
+
`_build_result_from_response()`:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from datetime import datetime, timezone
|
|
141
|
+
|
|
142
|
+
# inside _build_result_from_response(), after building `result`:
|
|
143
|
+
result.scraped_at = datetime.now(timezone.utc).isoformat()
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Summary Checklist
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
[x] Add scrape_many_stream() async generator to orchestrator.py (DONE)
|
|
152
|
+
[x] Add scraped_at field to ScrapeResult (DONE)
|
|
153
|
+
[x] Confirm overview["page_title"] still populated in pipeline output (CONFIRMED — core.py line ~165)
|
|
154
|
+
[x] Confirm overview["outline"] still populated in pipeline output (CONFIRMED — core.py line ~170)
|
|
155
|
+
[x] Confirm overview["metadata"] still populated in pipeline output (CONFIRMED — core.py line ~163)
|
|
156
|
+
[x] Add char_count_formatted back to overview dict (DONE — core.py)
|
|
157
|
+
[x] Build ScrapeService adapter (matrx_scraper/service.py) (DONE)
|
|
158
|
+
[x] ↳ adapter wraps scrape_many_stream(), emits per-result send_data()
|
|
159
|
+
[x] ↳ adapter builds fetch_results envelope with execution_time_ms
|
|
160
|
+
[x] ↳ adapter applies boolean field-flag filtering post-scrape
|
|
161
|
+
[x] ↳ adapter defaults proxy=True (DB lookup deferred — low priority)
|
|
162
|
+
[x] Router (aidream/api/routers/scraper.py) wired to matrx_scraper (DONE)
|
|
163
|
+
[ ] Notify React team:
|
|
164
|
+
[ ] ↳ result.status ("success"/"error") → result.success (bool)
|
|
165
|
+
[ ] ↳ result.error (string) → result.failure_reason (string)
|
|
166
|
+
[ ] ↳ result.hashes type changed: string[] → { minhash, simhash, outline_simhash }
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## 6. Domain-Config SQL Has No Canonical Schema
|
|
172
|
+
|
|
173
|
+
**Severity: LOW (no crash, but silently broken)**
|
|
174
|
+
|
|
175
|
+
`PostgresDomainConfigStore._load_all_domains()` ships hand-written SQL against
|
|
176
|
+
four tables — `scrape_domain`, `scrape_domain_settings`, `scrape_path_pattern`,
|
|
177
|
+
`scrape_path_override` — but the package does not own any DDL for those tables.
|
|
178
|
+
The column names are inferred from whatever the host's migrations happened to
|
|
179
|
+
create. We hit this in the wild on 2026-05-01: the package SQL referenced
|
|
180
|
+
`scrape_path_pattern.domain_id` and `scrape_path_pattern.pattern`, while the
|
|
181
|
+
actual columns are `scrape_domain_id` and `path_pattern`. The `_refresh()`
|
|
182
|
+
exception was caught and logged, so domain configs silently fell back to empty
|
|
183
|
+
and proxy/permission rules never engaged — this is the exact silent-degradation
|
|
184
|
+
mode the platform principle ("aggressively detected, not just patched") tells
|
|
185
|
+
us to eliminate.
|
|
186
|
+
|
|
187
|
+
**Two complementary platform fixes:**
|
|
188
|
+
|
|
189
|
+
1. **Ship the DDL.** Add `matrx_scraper/sql/domain_config.sql` (or a tiny
|
|
190
|
+
migration set) that creates the four tables with the column names the
|
|
191
|
+
package's queries actually use. A standalone consumer runs it once; aidream
|
|
192
|
+
is a pre-existing host whose migrations already created compatible tables.
|
|
193
|
+
2. **Pre-flight schema check.** On `PostgresDomainConfigStore.start()`, run a
|
|
194
|
+
`LIMIT 0` variant of each query on a single connection. If any column is
|
|
195
|
+
missing, raise `RuntimeError` with the exact column-vs-table that doesn't
|
|
196
|
+
match — fail loudly at startup instead of silently caching `{}` for the
|
|
197
|
+
life of the process.
|
|
198
|
+
|
|
199
|
+
The hot-fix on 2026-05-01 just renamed the columns in-query (using SQL
|
|
200
|
+
aliases so the Python `row[...]` access is unchanged). That removes the
|
|
201
|
+
immediate failure but leaves the same class of failure in place for the next schema drift.
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## ✅ MCP Tools & Research — Migrated (done separately)
|
|
206
|
+
|
|
207
|
+
All active callers of the old `scraper_enhanced` parser have been switched to
|
|
208
|
+
`matrx_scraper`. The `ai_research_with_images` field and all concurrent patterns
|
|
209
|
+
are unchanged — only the import source changed.
|
|
210
|
+
|
|
211
|
+
| File | Change |
|
|
212
|
+
|---|---|
|
|
213
|
+
| `scraper/scraper_enhanced/features/read_page.py` | `scraper_enhanced.{scraper,parser.parser,content_extractors}` → `matrx_scraper.{scraper,parser.core,extractors}` |
|
|
214
|
+
| `scraper/scraper_enhanced/features/mcp_tool_helpers.py` | same |
|
|
215
|
+
| `scraper/scraper_enhanced/features/top_n_brave_results.py` | same |
|
|
216
|
+
| `research/multisource.py` | `scraper_enhanced.parser.parser.parse_html` → `matrx_scraper.parser.core.parse_html` |
|
|
217
|
+
| `research/scraper.py` | already on `matrx_scraper` — no change needed |
|
|
218
|
+
|
|
219
|
+
See `LEGACY_AUDIT.md` for the full inventory of remaining old-scraper references
|
|
220
|
+
(deprecated Socket.IO stack + dead scripts).
|