matterify 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matterify-0.1.0/.gitignore +211 -0
- matterify-0.1.0/AGENTS.md +202 -0
- matterify-0.1.0/LICENSE +21 -0
- matterify-0.1.0/PKG-INFO +216 -0
- matterify-0.1.0/README.md +190 -0
- matterify-0.1.0/pyproject.toml +90 -0
- matterify-0.1.0/src/matterify/__init__.py +16 -0
- matterify-0.1.0/src/matterify/cli.py +113 -0
- matterify-0.1.0/src/matterify/constants.py +17 -0
- matterify-0.1.0/src/matterify/extractor.py +267 -0
- matterify-0.1.0/src/matterify/logging.py +49 -0
- matterify-0.1.0/src/matterify/models.py +77 -0
- matterify-0.1.0/src/matterify/scanner.py +43 -0
- matterify-0.1.0/tests/conftest.py +60 -0
- matterify-0.1.0/tests/test_cli.py +68 -0
- matterify-0.1.0/tests/test_extractor.py +310 -0
- matterify-0.1.0/tests/test_models.py +152 -0
- matterify-0.1.0/tests/test_scanner.py +85 -0
- matterify-0.1.0/tests/test_utils.py +13 -0
- matterify-0.1.0/uv.lock +811 -0
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock is tracked for this application to ensure reproducible builds
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
208
|
+
session-ses_2f0a.md
|
|
209
|
+
|
|
210
|
+
# Benchmarks
|
|
211
|
+
.benchmarks/
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
## Project Description
|
|
4
|
+
`matterify` is a Python utility that recursively scans directory structures for Markdown files, extracts their embedded YAML frontmatter metadata, and aggregates all information into a structured, machine-readable JSON file for further processing.
|
|
5
|
+
|
|
6
|
+
## Project Structure
|
|
7
|
+
```text
|
|
8
|
+
matterify/
|
|
9
|
+
├── .python-version, pyproject.toml, uv.lock # Env & Dependency management
|
|
10
|
+
├── AGENTS.md, README.md, LICENSE # Docs & Guidelines
|
|
11
|
+
├── src/matterify/ # Source (src-layout)
|
|
12
|
+
│ ├── __init__.py # Metadata + public API entry points
|
|
13
|
+
│ ├── extractor.py # Frontmatter extraction & aggregation logic
|
|
14
|
+
│ ├── models.py # Frozen dataclasses (FrontmatterEntry, ScanMetadata, AggregatedResult)
|
|
15
|
+
│ ├── scanner.py # Directory traversal with blacklist filtering
|
|
16
|
+
│ ├── cli.py # Click CLI entry point
|
|
17
|
+
│ ├── logging.py # Debug & console config
|
|
18
|
+
│ └── utils/ # Utility modules
|
|
19
|
+
├── tests/ # Pytest suite
|
|
20
|
+
│ ├── conftest.py # Fixtures
|
|
21
|
+
│ ├── test_extractor.py, test_utils.py # Unit tests
|
|
22
|
+
│ └── test_cli.py # CLI tests
|
|
23
|
+
└── docs/ # MkDocs source
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Development Workflows
|
|
27
|
+
|
|
28
|
+
### UV Environment & Dependencies
|
|
29
|
+
- **Sync:** `uv sync` (add `--all-extras` for dev/docs).
|
|
30
|
+
- **Update:** `uv lock --upgrade`.
|
|
31
|
+
- **Management:** `uv add <pkg>` (use `--dev` for dev); `uv remove <pkg>`; `uv pip list`.
|
|
32
|
+
- **Strategy:** Use min constraints (e.g., `click>=8.1.0`) in `pyproject.toml`; rely on `uv.lock` for reproducibility. Avoid manual lock edits.
|
|
33
|
+
|
|
34
|
+
### Execution & Lifecycle
|
|
35
|
+
- **Run:** `uv run [matterify|python script.py|tool] [args]`.
|
|
36
|
+
- **Project:** `uv init` (setup); `uv check` (compat-check).
|
|
37
|
+
- **Dist:** `uv build` (wheel/sdist); `uv publish` (upload).
|
|
38
|
+
|
|
39
|
+
### Standards & Git
|
|
40
|
+
- **Versioning:** Strict SemVer (`MAJOR.MINOR.PATCH`).
|
|
41
|
+
- **Commits:** Follow Conventional Commits (e.g., `feat:`, `fix:`, `chore:`).
|
|
42
|
+
- **Automation:** **Never** commit autonomously; only execute on explicit user request.
|
|
43
|
+
|
|
44
|
+
## Testing & QA
|
|
45
|
+
|
|
46
|
+
### Quality Checks
|
|
47
|
+
**Tools:** `ruff` (lint/fmt), `mypy` (types). Prefix cmds with `uv run`.
|
|
48
|
+
- **Fmt/Lint:** `ruff format [--check] src/ tests/`, `ruff check src/ tests/`
|
|
49
|
+
- **Types:** `mypy src/`
|
|
50
|
+
- **Pre-Commit Gate:** `uv run ruff format src/ tests/ && uv run ruff check src/ tests/ && uv run mypy src/ && uv run pytest`
|
|
51
|
+
|
|
52
|
+
### Tests (`uv run pytest`)
|
|
53
|
+
- **Exec:** `.` (all), `-v` (verbose), `tests/[file].py[::func]` (targeted), `--cov=matterify --cov-report=html` (coverage).
|
|
54
|
+
- **Structure:** `tests/` dir 1:1 mapping (`extractor.py`->`test_extractor.py`, `utils/__init__.py`->`test_utils.py`, `cli.py`->`test_cli.py`).
|
|
55
|
+
- **FS Rules:** Prioritize critical paths. Use `tmp_path`. Name staging dirs `project/` (avoids `src/src/` nesting).
|
|
56
|
+
- **Paths:** Stored paths include top-level prefix (`project/src/main.py`). Assert via `endswith()` or `rglob()`.
|
|
57
|
+
- **Public API only:** Never import or call private symbols (names starting with `_`) from `src/` in tests. Test behaviour exclusively through the public API.
|
|
58
|
+
- **No inline imports:** All imports must be at the top of the test file. `import` statements inside test functions are forbidden.
|
|
59
|
+
|
|
60
|
+
## Tech Stack & Standards
|
|
61
|
+
- **Runtime:** Python 3.12.3
|
|
62
|
+
- **Concurrency:** `asyncio` (core)
|
|
63
|
+
- **Package Mgmt:** `uv` via `pyproject.toml` (Build: `hatchling`)
|
|
64
|
+
- **CLI/UI:** `click` (commands); `rich` (UI/verbose)
|
|
65
|
+
- **Logging:** `structlog` (debug/structured)
|
|
66
|
+
- **Parsing:** `pyyaml` (YAML Frontmatter)
|
|
67
|
+
- **Quality:** `ruff` (lint/fmt); `mypy` (strict)
|
|
68
|
+
- **Testing:** `pytest` (plugins: `benchmark`, `asyncio`)
|
|
69
|
+
- **Docs:** `mkdocs` with Material theme
|
|
70
|
+
|
|
71
|
+
## Coding Standards
|
|
72
|
+
- **Typing:** Strict `mypy` for `src/`; relaxed for `tests/`.
|
|
73
|
+
- **Type Aliases:** Use PEP 695 `type X = ...` (Python 3.12+). **Avoid** `TypeAlias` (ruff `UP040`).
|
|
74
|
+
- **Format:** PEP8 via `ruff`; 100 char limit.
|
|
75
|
+
- **Testing:** ≥1 unit test/function; use `tmp_path` for FS.
|
|
76
|
+
- **UI/Logging:** CLI silent by default. Use `structlog` for internal debug logs and `rich` for verbose user feedback. **Strictly isolate** UI output from internal loggers.
|
|
77
|
+
|
|
78
|
+
### Import Rules (ruff)
|
|
79
|
+
- **Order (I001):** stdlib → third-party → local. Separate with one blank line. Run `uv run ruff check --fix` or `format` to resolve.
|
|
80
|
+
- **Unused Imports (F401):** Remove immediately; every import must be referenced.
|
|
81
|
+
- **`TYPE_CHECKING` (TC005):** Delete empty `if TYPE_CHECKING: pass` blocks; keep the guard only when it actually contains imports.
|
|
82
|
+
- **Async-safe I/O (ASYNC240):** Never call blocking `pathlib.Path` methods inside `async def`. Wrap with `asyncio.to_thread(path.method, ...)`.
|
|
83
|
+
- **Pathlib over `os` (PTH):** Use `pathlib` equivalents (e.g., `Path.unlink()`) over `os`. Avoid `os` unless no `pathlib` alternative exists.
|
|
84
|
+
- **`contextlib.suppress` (SIM105):** Replace `try: ... except Error: pass` with `with contextlib.suppress(Error):`.
|
|
85
|
+
|
|
86
|
+
### mypy Rules (strict)
|
|
87
|
+
- **Return types:** All functions (including `__exit__`) require explicit annotations.
|
|
88
|
+
- **`__exit__` signature:** Use exact typing:
|
|
89
|
+
```python
|
|
90
|
+
def __exit__(
|
|
91
|
+
self,
|
|
92
|
+
exc_type: type[BaseException] | None,
|
|
93
|
+
exc_val: BaseException | None,
|
|
94
|
+
exc_tb: TracebackType | None,
|
|
95
|
+
) -> None:
|
|
96
|
+
```
|
|
97
|
+
Import `TracebackType` from `types` (use `if TYPE_CHECKING:` if preferred).
|
|
98
|
+
- **`asyncio.to_thread`:** Pass bound methods directly: `asyncio.to_thread(path.read_text, encoding="utf-8")`. Avoid lambdas to preserve return-type inference.
|
|
99
|
+
|
|
100
|
+
## Python API
|
|
101
|
+
|
|
102
|
+
### Public Functions
|
|
103
|
+
- `scan_directory(directory: Path, n_procs: int = 4, blacklist: tuple[str, ...] | None = None, compute_hash: bool = True, compute_stats: bool = True) -> AggregatedResult`: Scan directory and aggregate frontmatter using parallel workers. Returns an `AggregatedResult` dataclass.
|
|
104
|
+
|
|
105
|
+
### AggregatedResult Structure
|
|
106
|
+
The `scan_directory()` function returns an `AggregatedResult` dataclass:
|
|
107
|
+
```python
|
|
108
|
+
{
|
|
109
|
+
"metadata": ScanMetadata(...),
|
|
110
|
+
"files": list[FrontmatterEntry]
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Where `ScanMetadata` contains:
|
|
115
|
+
```python
|
|
116
|
+
{
|
|
117
|
+
"source_directory": str,
|
|
118
|
+
"total_files": int,
|
|
119
|
+
"files_with_frontmatter": int,
|
|
120
|
+
"files_without_frontmatter": int,
|
|
121
|
+
"errors": int,
|
|
122
|
+
"scan_duration_seconds": float,
|
|
123
|
+
"avg_duration_per_file_ms": float,
|
|
124
|
+
"throughput_files_per_second": float,
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Public Types
|
|
129
|
+
- `FrontmatterEntry`: Dataclass representing extracted frontmatter from a single file.
|
|
130
|
+
- `ScanMetadata`: Dataclass containing summary statistics about a scan.
|
|
131
|
+
- `AggregatedResult`: Dataclass holding metadata and file entries.
|
|
132
|
+
|
|
133
|
+
### Status Values
|
|
134
|
+
- `ok`: File successfully parsed with valid YAML frontmatter.
|
|
135
|
+
- `illegal`: File has issues (no frontmatter, invalid YAML, or parse errors).
|
|
136
|
+
|
|
137
|
+
## CLI
|
|
138
|
+
|
|
139
|
+
### Usage
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
matterify DIRECTORY [OPTIONS]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**Arguments:**
|
|
146
|
+
- `directory`: Directory to scan (required).
|
|
147
|
+
|
|
148
|
+
**Options:**
|
|
149
|
+
- `--output`, `-o`: Write JSON to file instead of stdout (if omitted, outputs to stdout).
|
|
150
|
+
- `--n-procs`: Worker process count (default: auto-detect CPU cores).
|
|
151
|
+
- `--verbose`, `-v`: Show progress and summary.
|
|
152
|
+
- `--exclude`, `-e`: Additional directories to exclude.
|
|
153
|
+
- `--debug`: Enable debug logging.
|
|
154
|
+
- `--version`: Show version information.
|
|
155
|
+
|
|
156
|
+
## Logging & UI (`src/matterify/logging.py`)
|
|
157
|
+
- `configure_debug_logging(enabled)`: Configures `structlog`. Use `logging.CRITICAL` (50) for no-op. **Avoid `logging.CRITICAL + 1`** (causes `KeyError`).
|
|
158
|
+
- `get_console(verbose)`: Returns `rich.Console()`. Verbose writes to **stdout** for `CliRunner` capture; otherwise `quiet=True`.
|
|
159
|
+
|
|
160
|
+
### structlog Rules
|
|
161
|
+
- **Init**: Use `structlog.get_logger(__name__)`. **Never** `logging.getLogger()`.
|
|
162
|
+
- **Context**: Use kwargs: `logger.debug("msg", k=v)`. **Never** `extra={...}` (crashes on reserved keys like `name`).
|
|
163
|
+
|
|
164
|
+
### Established log fields
|
|
165
|
+
- `total_files`: Total number of Markdown files discovered.
|
|
166
|
+
- `with_frontmatter`: Count of files with valid frontmatter.
|
|
167
|
+
- `errors`: Count of files that produced errors.
|
|
168
|
+
- `duration`: Total scan duration in seconds.
|
|
169
|
+
- `output`: Path to the exported JSON file.
|
|
170
|
+
|
|
171
|
+
## Architecture & Mechanisms
|
|
172
|
+
|
|
173
|
+
### Module Overview
|
|
174
|
+
- `extractor.py`: Core extraction logic - parsing YAML frontmatter, parallel processing with `ProcessPoolExecutor`, JSON export.
|
|
175
|
+
- `scanner.py`: Directory traversal with blacklist filtering using `Path.walk()`.
|
|
176
|
+
- `models.py`: Frozen dataclasses for type-safe data structures.
|
|
177
|
+
- `logging.py`: `structlog` configuration and `rich.Console` factory.
|
|
178
|
+
- `cli.py`: Click-based CLI entry point.
|
|
179
|
+
|
|
180
|
+
### Extraction Pipeline
|
|
181
|
+
1. `iter_markdown_files()` discovers all `.md`/`.markdown` files, respecting blacklist.
|
|
182
|
+
2. `scan_directory()` distributes files across `ProcessPoolExecutor` workers.
|
|
183
|
+
3. Each worker runs `extract_frontmatter()` which:
|
|
184
|
+
- Reads file content as UTF-8
|
|
185
|
+
- Checks for `---` delimiters
|
|
186
|
+
- Parses YAML block with `yaml.safe_load`
|
|
187
|
+
- Validates frontmatter is a dictionary
|
|
188
|
+
- Serializes `datetime`/`date` objects to ISO strings
|
|
189
|
+
4. Results are sorted, deduplicated relative to root, and aggregated.
|
|
190
|
+
|
|
191
|
+
### File Status Classification
|
|
192
|
+
- `ok`: Valid YAML frontmatter found and parsed.
|
|
193
|
+
- `illegal`: No frontmatter, invalid YAML, or parse error.
|
|
194
|
+
|
|
195
|
+
## Docstring Rules
|
|
196
|
+
- **Format:** Google Style (`Args:`, `Returns:`, `Raises:`).
|
|
197
|
+
- **Markup:** Markdown ONLY; NO reST/Sphinx directives (`:class:`, etc.).
|
|
198
|
+
- **Code/Links:** Backticks (single inline, triple block). MkDocs autorefs (`[MyClass][]`).
|
|
199
|
+
- **Types:** Rely on Python type hints; do not duplicate in docstrings.
|
|
200
|
+
- **Style:** PEP 257 imperative mood ("Return X", not "Returns X").
|
|
201
|
+
- **Length:** One-liners for simple/private. Multi-line/sections ONLY for complex/public APIs. Omit redundant `Args:`/`Returns:`.
|
|
202
|
+
- **Staleness:** Always update docstrings, inline comments, and class `Supported modes:` when implementing scaffolds. Treat stale "not yet implemented" text as a bug.
|
matterify-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Christian Gröling
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
matterify-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: matterify
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract and aggregate YAML frontmatter from Markdown files into structured JSON
|
|
5
|
+
Project-URL: Homepage, https://github.com/chgroeling/matterify
|
|
6
|
+
Project-URL: Repository, https://github.com/chgroeling/matterify
|
|
7
|
+
Author-email: Christian Gröling <contact@christiangroeling.de>
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: cli,frontmatter,json,markdown,metadata,static-site,yaml
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: click>=8.1.0
|
|
22
|
+
Requires-Dist: pyyaml>=6.0
|
|
23
|
+
Requires-Dist: rich>=13.0.0
|
|
24
|
+
Requires-Dist: structlog>=24.0.0
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# Matterify
|
|
28
|
+
|
|
29
|
+
Extract and aggregate YAML frontmatter from Markdown files.
|
|
30
|
+
|
|
31
|
+
[Python 3.12+](https://www.python.org/downloads/)
|
|
32
|
+
[License: MIT](LICENSE)
|
|
33
|
+
[PyPI](https://pypi.org/project/matterify/)
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install matterify
|
|
39
|
+
matterify ./docs -o output.json
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Using uv (recommended)
|
|
46
|
+
uv add matterify
|
|
47
|
+
|
|
48
|
+
# Or with pip
|
|
49
|
+
pip install matterify
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## CLI Usage
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
matterify DIRECTORY [OPTIONS]
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
**Options:**
|
|
59
|
+
- `-o, --output PATH` - Write JSON to file instead of stdout (if omitted, outputs to stdout)
|
|
60
|
+
- `--n-procs INT` - Worker process count (default: auto-detect CPU cores)
|
|
61
|
+
- `-v, --verbose` - Show progress and summary
|
|
62
|
+
- `-e, --exclude TEXT` - Additional directories to exclude
|
|
63
|
+
- `--hash / --no-hash` - Enable/disable SHA-256 hash computation
|
|
64
|
+
- `--stats / --no-stats` - Enable/disable file statistics (size, modified time, access time)
|
|
65
|
+
- `--debug` - Enable debug logging
|
|
66
|
+
- `--version` - Show version information and exit
|
|
67
|
+
|
|
68
|
+
**Examples:**
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Output to stdout (JSON)
|
|
72
|
+
matterify ./docs
|
|
73
|
+
|
|
74
|
+
# Output to file
|
|
75
|
+
matterify ./docs -o output.json
|
|
76
|
+
|
|
77
|
+
# Verbose output
|
|
78
|
+
matterify ./docs --verbose
|
|
79
|
+
|
|
80
|
+
# Disable hashes and file stats
|
|
81
|
+
matterify ./docs --no-hash --no-stats
|
|
82
|
+
|
|
83
|
+
# Exclude additional directories
|
|
84
|
+
matterify ./docs -e build -e .cache
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Python API
|
|
88
|
+
|
|
89
|
+
### Public Functions
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from pathlib import Path
|
|
93
|
+
from matterify import (
|
|
94
|
+
scan_directory,
|
|
95
|
+
)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
#### scan_directory
|
|
99
|
+
|
|
100
|
+
Scan directory and aggregate frontmatter using parallel workers. Returns an `AggregatedResult` dataclass.
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from pathlib import Path
|
|
104
|
+
from matterify import scan_directory
|
|
105
|
+
|
|
106
|
+
result = scan_directory(Path("./docs"))
|
|
107
|
+
|
|
108
|
+
# AggregatedResult contains:
|
|
109
|
+
# - result.metadata: ScanMetadata with scan statistics
|
|
110
|
+
# - result.files: list of file entries with extraction results
|
|
111
|
+
|
|
112
|
+
# Access metadata
|
|
113
|
+
print(result.metadata.total_files)
|
|
114
|
+
print(result.metadata.files_with_frontmatter)
|
|
115
|
+
print(result.metadata.scan_duration_seconds)
|
|
116
|
+
|
|
117
|
+
# Access files
|
|
118
|
+
for entry in result.files:
|
|
119
|
+
print(entry.file_path, entry.status)
|
|
120
|
+
print(entry.stats.file_size if entry.stats else None)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Public Types
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from matterify import (
|
|
127
|
+
FileEntry,
|
|
128
|
+
ScanMetadata,
|
|
129
|
+
AggregatedResult,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
# FileEntry: extracted frontmatter from a single file
|
|
133
|
+
entry: FileEntry
|
|
134
|
+
|
|
135
|
+
# ScanMetadata: summary statistics about a scan
|
|
136
|
+
metadata: ScanMetadata
|
|
137
|
+
|
|
138
|
+
# AggregatedResult: holds metadata and file entries
|
|
139
|
+
result: AggregatedResult
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## JSON Output Structure
|
|
143
|
+
|
|
144
|
+
When using CLI (stdout or `--output`), the payload has this shape:
|
|
145
|
+
|
|
146
|
+
```json
|
|
147
|
+
{
|
|
148
|
+
"metadata": {
|
|
149
|
+
"source_directory": "/path/to/docs",
|
|
150
|
+
"total_files": 10,
|
|
151
|
+
"files_with_frontmatter": 8,
|
|
152
|
+
"files_without_frontmatter": 2,
|
|
153
|
+
"errors": 0,
|
|
154
|
+
"scan_duration_seconds": 0.523,
|
|
155
|
+
"avg_duration_per_file_ms": 52.3,
|
|
156
|
+
"throughput_files_per_second": 19.1
|
|
157
|
+
},
|
|
158
|
+
"files": [
|
|
159
|
+
{
|
|
160
|
+
"file_path": "getting-started.md",
|
|
161
|
+
"frontmatter": {
|
|
162
|
+
"title": "Getting Started",
|
|
163
|
+
"date": "2024-01-15",
|
|
164
|
+
"tags": ["guide", "tutorial"]
|
|
165
|
+
},
|
|
166
|
+
"status": "ok",
|
|
167
|
+
"error": null,
|
|
168
|
+
"stats": {
|
|
169
|
+
"file_size": 1234,
|
|
170
|
+
"modified_time": "2024-01-15T10:30:00",
|
|
171
|
+
"access_time": "2024-01-15T10:30:00"
|
|
172
|
+
},
|
|
173
|
+
"file_hash": "abc123..."
|
|
174
|
+
}
|
|
175
|
+
]
|
|
176
|
+
}
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
`status` is either `"ok"` or `"illegal"`.
|
|
180
|
+
|
|
181
|
+
## Default Exclusions
|
|
182
|
+
|
|
183
|
+
The following directories are excluded from scanning by default:
|
|
184
|
+
|
|
185
|
+
- `.git`
|
|
186
|
+
- `.obsidian`
|
|
187
|
+
- `__pycache__`
|
|
188
|
+
- `.venv`
|
|
189
|
+
- `venv`
|
|
190
|
+
- `node_modules`
|
|
191
|
+
- `.mypy_cache`
|
|
192
|
+
- `.pytest_cache`
|
|
193
|
+
- `.ruff_cache`
|
|
194
|
+
|
|
195
|
+
Use `-e` or `--exclude` to add custom exclusions.
|
|
196
|
+
|
|
197
|
+
## Development
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
# Install with dev dependencies
|
|
201
|
+
uv sync --all-extras
|
|
202
|
+
|
|
203
|
+
# Run tests
|
|
204
|
+
uv run pytest
|
|
205
|
+
|
|
206
|
+
# Format and lint
|
|
207
|
+
uv run ruff format src/ tests/
|
|
208
|
+
uv run ruff check src/ tests/
|
|
209
|
+
|
|
210
|
+
# Type check
|
|
211
|
+
uv run mypy src/
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## License
|
|
215
|
+
|
|
216
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|