mnemebrain-benchmark 0.1.0a1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. mnemebrain_benchmark-0.1.0a1/.github/ISSUE_TEMPLATE/bug_report.yml +66 -0
  2. mnemebrain_benchmark-0.1.0a1/.github/ISSUE_TEMPLATE/config.yml +2 -0
  3. mnemebrain_benchmark-0.1.0a1/.github/ISSUE_TEMPLATE/feature_request.yml +44 -0
  4. mnemebrain_benchmark-0.1.0a1/.github/ISSUE_TEMPLATE/new_adapter.yml +61 -0
  5. mnemebrain_benchmark-0.1.0a1/.github/workflows/ci.yml +84 -0
  6. mnemebrain_benchmark-0.1.0a1/.github/workflows/codeql.yml +54 -0
  7. mnemebrain_benchmark-0.1.0a1/.github/workflows/dependency-review.yml +19 -0
  8. mnemebrain_benchmark-0.1.0a1/.github/workflows/pylint.yml +31 -0
  9. mnemebrain_benchmark-0.1.0a1/.github/workflows/release.yml +96 -0
  10. mnemebrain_benchmark-0.1.0a1/.gitignore +19 -0
  11. mnemebrain_benchmark-0.1.0a1/BMB_REPORT.md +417 -0
  12. mnemebrain_benchmark-0.1.0a1/CONTRIBUTING.md +353 -0
  13. mnemebrain_benchmark-0.1.0a1/LICENSE +21 -0
  14. mnemebrain_benchmark-0.1.0a1/PKG-INFO +26 -0
  15. mnemebrain_benchmark-0.1.0a1/README.md +128 -0
  16. mnemebrain_benchmark-0.1.0a1/docs/adding-adapters.md +211 -0
  17. mnemebrain_benchmark-0.1.0a1/docs/architecture.md +118 -0
  18. mnemebrain_benchmark-0.1.0a1/pyproject.toml +46 -0
  19. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/__init__.py +1 -0
  20. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/__main__.py +4 -0
  21. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/__init__.py +1 -0
  22. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/langchain_buffer.py +55 -0
  23. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/mem0_adapter.py +181 -0
  24. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/mnemebrain_adapter.py +216 -0
  25. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/naive_baseline.py +68 -0
  26. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/openai_rag_adapter.py +110 -0
  27. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/rag_baseline.py +77 -0
  28. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/adapters/structured_memory.py +154 -0
  29. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/bmb_cli.py +220 -0
  30. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/data/claim_pairs.json +602 -0
  31. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/dataset.py +95 -0
  32. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/interface.py +146 -0
  33. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/metrics.py +129 -0
  34. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/protocols.py +14 -0
  35. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/runner.py +207 -0
  36. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/scenarios/__init__.py +0 -0
  37. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/scenarios/data/bmb_scenarios.json +2640 -0
  38. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/scenarios/data/scenarios.json +1069 -0
  39. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/scenarios/loader.py +93 -0
  40. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/scenarios/schema.py +73 -0
  41. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/scoring.py +307 -0
  42. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/system_cli.py +94 -0
  43. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/system_report.py +95 -0
  44. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/system_runner.py +167 -0
  45. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/__init__.py +1 -0
  46. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/__main__.py +118 -0
  47. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/base.py +61 -0
  48. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/data/preference_scenarios.json +538 -0
  49. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/data/qa_scenarios.json +449 -0
  50. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/long_horizon_qa.py +49 -0
  51. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/preference_tracking.py +49 -0
  52. mnemebrain_benchmark-0.1.0a1/src/mnemebrain_benchmark/task_evals/runner.py +129 -0
  53. mnemebrain_benchmark-0.1.0a1/tests/__init__.py +0 -0
  54. mnemebrain_benchmark-0.1.0a1/tests/conftest.py +5 -0
  55. mnemebrain_benchmark-0.1.0a1/tests/helpers.py +46 -0
  56. mnemebrain_benchmark-0.1.0a1/tests/test_cli.py +110 -0
  57. mnemebrain_benchmark-0.1.0a1/tests/test_dataset.py +146 -0
  58. mnemebrain_benchmark-0.1.0a1/tests/test_interface.py +152 -0
  59. mnemebrain_benchmark-0.1.0a1/tests/test_langchain_buffer.py +50 -0
  60. mnemebrain_benchmark-0.1.0a1/tests/test_loader.py +121 -0
  61. mnemebrain_benchmark-0.1.0a1/tests/test_mem0_adapter.py +164 -0
  62. mnemebrain_benchmark-0.1.0a1/tests/test_metrics.py +106 -0
  63. mnemebrain_benchmark-0.1.0a1/tests/test_mnemebrain_adapter.py +250 -0
  64. mnemebrain_benchmark-0.1.0a1/tests/test_naive_baseline.py +60 -0
  65. mnemebrain_benchmark-0.1.0a1/tests/test_openai_rag_adapter.py +111 -0
  66. mnemebrain_benchmark-0.1.0a1/tests/test_protocols.py +26 -0
  67. mnemebrain_benchmark-0.1.0a1/tests/test_rag_baseline.py +63 -0
  68. mnemebrain_benchmark-0.1.0a1/tests/test_runner.py +54 -0
  69. mnemebrain_benchmark-0.1.0a1/tests/test_schema.py +82 -0
  70. mnemebrain_benchmark-0.1.0a1/tests/test_scoring.py +348 -0
  71. mnemebrain_benchmark-0.1.0a1/tests/test_structured_memory.py +133 -0
  72. mnemebrain_benchmark-0.1.0a1/tests/test_system_report.py +77 -0
  73. mnemebrain_benchmark-0.1.0a1/tests/test_system_runner.py +284 -0
  74. mnemebrain_benchmark-0.1.0a1/tests/test_task_evals.py +118 -0
  75. mnemebrain_benchmark-0.1.0a1/tests/test_task_evals_runner.py +108 -0
  76. mnemebrain_benchmark-0.1.0a1/tests/test_task_evals_scenarios.py +134 -0
  77. mnemebrain_benchmark-0.1.0a1/uv.lock +1934 -0
@@ -0,0 +1,66 @@
1
+ name: Bug Report
2
+ description: Report a bug in mnemebrain-benchmark
3
+ labels: ["bug"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for reporting a bug. Please fill in the details below.
9
+
10
+ - type: input
11
+ id: version
12
+ attributes:
13
+ label: Package version
14
+ description: Output of `pip show mnemebrain-benchmark | grep Version`
15
+ placeholder: "0.1.0"
16
+ validations:
17
+ required: true
18
+
19
+ - type: input
20
+ id: python
21
+ attributes:
22
+ label: Python version
23
+ description: Output of `python --version`
24
+ placeholder: "3.12.0"
25
+ validations:
26
+ required: true
27
+
28
+ - type: dropdown
29
+ id: component
30
+ attributes:
31
+ label: Component
32
+ options:
33
+ - System Benchmark
34
+ - BMB (Belief Maintenance Benchmark)
35
+ - Embedding Benchmark
36
+ - Task Evaluations
37
+ - Adapter (specify in description)
38
+ - Scoring / Metrics
39
+ - CLI
40
+ - Other
41
+ validations:
42
+ required: true
43
+
44
+ - type: textarea
45
+ id: description
46
+ attributes:
47
+ label: Description
48
+ description: What happened and what did you expect?
49
+ validations:
50
+ required: true
51
+
52
+ - type: textarea
53
+ id: reproduce
54
+ attributes:
55
+ label: Steps to reproduce
56
+ description: Minimal code or commands to reproduce the issue.
57
+ render: python
58
+ validations:
59
+ required: true
60
+
61
+ - type: textarea
62
+ id: logs
63
+ attributes:
64
+ label: Error output
65
+ description: Paste any relevant error messages or tracebacks.
66
+ render: text
@@ -0,0 +1,2 @@
1
+ blank_issues_enabled: false
2
+ contact_links: []
@@ -0,0 +1,44 @@
1
+ name: Feature Request
2
+ description: Suggest a new feature or improvement
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for suggesting an improvement to mnemebrain-benchmark.
9
+
10
+ - type: dropdown
11
+ id: category
12
+ attributes:
13
+ label: Category
14
+ options:
15
+ - New adapter
16
+ - New benchmark scenario
17
+ - New metric or scoring method
18
+ - CLI improvement
19
+ - Documentation
20
+ - Other
21
+ validations:
22
+ required: true
23
+
24
+ - type: textarea
25
+ id: description
26
+ attributes:
27
+ label: Description
28
+ description: What would you like to see added or changed?
29
+ validations:
30
+ required: true
31
+
32
+ - type: textarea
33
+ id: motivation
34
+ attributes:
35
+ label: Motivation
36
+ description: Why is this useful? What problem does it solve?
37
+ validations:
38
+ required: true
39
+
40
+ - type: textarea
41
+ id: alternatives
42
+ attributes:
43
+ label: Alternatives considered
44
+ description: Have you considered any alternative solutions?
@@ -0,0 +1,61 @@
1
+ name: New Adapter Proposal
2
+ description: Propose a new memory system adapter for the benchmark
3
+ labels: ["adapter", "enhancement"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Propose adding a new memory system adapter to the benchmark suite.
9
+ See [docs/adding-adapters.md](../../docs/adding-adapters.md) for the implementation guide.
10
+
11
+ - type: input
12
+ id: system_name
13
+ attributes:
14
+ label: Memory system name
15
+ placeholder: "e.g. chromadb, weaviate, pinecone"
16
+ validations:
17
+ required: true
18
+
19
+ - type: checkboxes
20
+ id: capabilities
21
+ attributes:
22
+ label: Capabilities
23
+ description: Which capabilities does this system support?
24
+ options:
25
+ - label: "store"
26
+ - label: "query"
27
+ - label: "retract"
28
+ - label: "explain"
29
+ - label: "contradiction"
30
+ - label: "decay"
31
+ - label: "revise"
32
+ - label: "sandbox"
33
+ - label: "attack"
34
+ - label: "consolidation"
35
+ - label: "hipporag"
36
+ - label: "pattern_separation"
37
+
38
+ - type: dropdown
39
+ id: dependency_type
40
+ attributes:
41
+ label: Dependency type
42
+ options:
43
+ - Local only (no API key needed)
44
+ - Cloud API (requires API key)
45
+ - Hybrid
46
+ validations:
47
+ required: true
48
+
49
+ - type: textarea
50
+ id: description
51
+ attributes:
52
+ label: Description
53
+ description: Brief description of the system and why it's a useful benchmark target.
54
+ validations:
55
+ required: true
56
+
57
+ - type: textarea
58
+ id: implementation_notes
59
+ attributes:
60
+ label: Implementation notes
61
+ description: Any relevant details about the adapter implementation.
@@ -0,0 +1,84 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ concurrency:
10
+ group: ${{ github.workflow }}-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ quality:
15
+ name: Lint & Type Check
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - uses: astral-sh/setup-uv@v4
20
+ with:
21
+ version: "latest"
22
+ - uses: actions/setup-python@v5
23
+ with:
24
+ python-version: "3.12"
25
+
26
+ - name: Install dependencies
27
+ run: uv sync --extra dev
28
+
29
+ - name: Lint with ruff
30
+ run: uv run ruff check src/ tests/
31
+
32
+ - name: Format check with ruff
33
+ run: uv run ruff format --check src/ tests/
34
+
35
+ - name: Type check with mypy
36
+ run: uv run mypy src/mnemebrain_benchmark/
37
+
38
+ test:
39
+ name: Tests (Python ${{ matrix.python-version }})
40
+ needs: quality
41
+ runs-on: ubuntu-latest
42
+ strategy:
43
+ matrix:
44
+ python-version: ["3.12", "3.13"]
45
+ steps:
46
+ - uses: actions/checkout@v4
47
+ - uses: astral-sh/setup-uv@v4
48
+ with:
49
+ version: "latest"
50
+ - uses: actions/setup-python@v5
51
+ with:
52
+ python-version: ${{ matrix.python-version }}
53
+
54
+ - name: Install dependencies
55
+ run: uv sync --extra dev
56
+
57
+ - name: Run tests with coverage
58
+ run: uv run pytest --cov=mnemebrain_benchmark --cov-report=xml --cov-report=term-missing -q
59
+
60
+ - name: Upload coverage
61
+ if: matrix.python-version == '3.12'
62
+ uses: actions/upload-artifact@v4
63
+ with:
64
+ name: coverage-report
65
+ path: coverage.xml
66
+
67
+ coverage-gate:
68
+ name: Coverage Gate
69
+ needs: test
70
+ runs-on: ubuntu-latest
71
+ steps:
72
+ - uses: actions/checkout@v4
73
+ - uses: astral-sh/setup-uv@v4
74
+ with:
75
+ version: "latest"
76
+ - uses: actions/setup-python@v5
77
+ with:
78
+ python-version: "3.12"
79
+
80
+ - name: Install dependencies
81
+ run: uv sync --extra dev
82
+
83
+ - name: Check coverage threshold
84
+ run: uv run pytest --cov=mnemebrain_benchmark --cov-fail-under=80 -q
@@ -0,0 +1,54 @@
1
+ name: "CodeQL Advanced"
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main" ]
6
+ pull_request:
7
+ branches: [ "main" ]
8
+ schedule:
9
+ - cron: '15 0 * * 4'
10
+
11
+ jobs:
12
+ analyze:
13
+ name: Analyze (${{ matrix.language }})
14
+ runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
15
+ permissions:
16
+ security-events: write
17
+ packages: read
18
+ actions: read
19
+ contents: read
20
+
21
+ strategy:
22
+ fail-fast: false
23
+ matrix:
24
+ include:
25
+ - language: actions
26
+ build-mode: none
27
+ - language: python
28
+ build-mode: none
29
+
30
+ steps:
31
+ - name: Checkout repository
32
+ uses: actions/checkout@v4
33
+
34
+ - name: Initialize CodeQL
35
+ uses: github/codeql-action/init@v4
36
+ with:
37
+ languages: ${{ matrix.language }}
38
+ build-mode: ${{ matrix.build-mode }}
39
+
40
+ - name: Run manual build steps
41
+ if: matrix.build-mode == 'manual'
42
+ shell: bash
43
+ run: |
44
+ echo 'If you are using a "manual" build mode for one or more of the' \
45
+ 'languages you are analyzing, replace this with the commands to build' \
46
+ 'your code, for example:'
47
+ echo ' make bootstrap'
48
+ echo ' make release'
49
+ exit 1
50
+
51
+ - name: Perform CodeQL Analysis
52
+ uses: github/codeql-action/analyze@v4
53
+ with:
54
+ category: "/language:${{matrix.language}}"
@@ -0,0 +1,19 @@
1
+ name: 'Dependency review'
2
+ on:
3
+ pull_request:
4
+ branches: [ "main" ]
5
+
6
+ permissions:
7
+ contents: read
8
+ pull-requests: write
9
+
10
+ jobs:
11
+ dependency-review:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - name: 'Checkout repository'
15
+ uses: actions/checkout@v4
16
+ - name: 'Dependency Review'
17
+ uses: actions/dependency-review-action@v4
18
+ with:
19
+ comment-summary-in-pr: always
@@ -0,0 +1,31 @@
1
+ name: Pylint
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ matrix:
10
+ python-version: ["3.12", "3.13"]
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v4
16
+ with:
17
+ version: "latest"
18
+
19
+ - name: Set up Python
20
+ env:
21
+ PYTHON_VERSION: ${{ matrix.python-version }}
22
+ run: uv python install "$PYTHON_VERSION"
23
+
24
+ - name: Install dependencies
25
+ run: uv sync --extra dev && uv pip install pylint
26
+
27
+ - name: Analysing the code with pylint
28
+ run: >
29
+ uv run pylint
30
+ --disable=C0103,C0114,C0115,C0116,C0301,C0415,E0401,R0801,R0902,R0903,R0912,R0913,R0914,R0915,R0917,W0603,W0621,W0107,W0613,W0718,W2301
31
+ src/
@@ -0,0 +1,96 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ permissions:
9
+ contents: write
10
+ id-token: write
11
+ packages: write
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ matrix:
18
+ python-version: ["3.12", "3.13"]
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v4
24
+ with:
25
+ version: "latest"
26
+
27
+ - name: Set up Python
28
+ env:
29
+ PYTHON_VERSION: ${{ matrix.python-version }}
30
+ run: uv python install "$PYTHON_VERSION"
31
+
32
+ - name: Install dependencies
33
+ run: uv sync --extra dev
34
+
35
+ - name: Run tests
36
+ run: uv run pytest tests/ -v
37
+
38
+ build:
39
+ needs: test
40
+ runs-on: ubuntu-latest
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+
44
+ - name: Install uv
45
+ uses: astral-sh/setup-uv@v4
46
+ with:
47
+ version: "latest"
48
+
49
+ - name: Set up Python
50
+ run: uv python install 3.12
51
+
52
+ - name: Build package
53
+ run: uv build
54
+
55
+ - name: Upload dist artifacts
56
+ uses: actions/upload-artifact@v4
57
+ with:
58
+ name: dist
59
+ path: dist/
60
+
61
+ publish-pypi:
62
+ needs: build
63
+ runs-on: ubuntu-latest
64
+ environment: pypi
65
+ permissions:
66
+ id-token: write
67
+ steps:
68
+ - name: Download dist artifacts
69
+ uses: actions/download-artifact@v4
70
+ with:
71
+ name: dist
72
+ path: dist/
73
+
74
+ - name: Publish to PyPI
75
+ uses: pypa/gh-action-pypi-publish@release/v1
76
+
77
+ publish-github:
78
+ needs: build
79
+ runs-on: ubuntu-latest
80
+ permissions:
81
+ contents: write
82
+ packages: write
83
+ steps:
84
+ - uses: actions/checkout@v4
85
+
86
+ - name: Download dist artifacts
87
+ uses: actions/download-artifact@v4
88
+ with:
89
+ name: dist
90
+ path: dist/
91
+
92
+ - name: Create GitHub Release
93
+ uses: softprops/action-gh-release@v2
94
+ with:
95
+ files: dist/*
96
+ generate_release_notes: true
@@ -0,0 +1,19 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .venv/
8
+ .env
9
+ .coverage
10
+ coverage.xml
11
+ htmlcov/
12
+ .pytest_cache/
13
+ .mypy_cache/
14
+ .ruff_cache/
15
+ *.json
16
+ !src/mnemebrain_benchmark/scenarios/data/*.json
17
+ !src/mnemebrain_benchmark/task_evals/data/*.json
18
+ !src/mnemebrain_benchmark/data/*.json
19
+ .dirigent/