late-interaction-kernels 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. late_interaction_kernels-0.0.1/.github/ISSUE_TEMPLATE/bug_report.yml +54 -0
  2. late_interaction_kernels-0.0.1/.github/ISSUE_TEMPLATE/config.yml +11 -0
  3. late_interaction_kernels-0.0.1/.github/ISSUE_TEMPLATE/feature_request.yml +25 -0
  4. late_interaction_kernels-0.0.1/.github/pull_request_template.md +16 -0
  5. late_interaction_kernels-0.0.1/.github/workflows/ci.yml +79 -0
  6. late_interaction_kernels-0.0.1/.github/workflows/publish.yml +54 -0
  7. late_interaction_kernels-0.0.1/.gitignore +223 -0
  8. late_interaction_kernels-0.0.1/CHANGELOG.md +51 -0
  9. late_interaction_kernels-0.0.1/CONTRIBUTING.md +62 -0
  10. late_interaction_kernels-0.0.1/LICENSE +201 -0
  11. late_interaction_kernels-0.0.1/PKG-INFO +252 -0
  12. late_interaction_kernels-0.0.1/README.md +217 -0
  13. late_interaction_kernels-0.0.1/benchmarks/bench_backward_0_5.py +279 -0
  14. late_interaction_kernels-0.0.1/benchmarks/bench_backward_method.py +177 -0
  15. late_interaction_kernels-0.0.1/benchmarks/bench_backward_unified.py +159 -0
  16. late_interaction_kernels-0.0.1/benchmarks/bench_cached_maxsim.py +216 -0
  17. late_interaction_kernels-0.0.1/benchmarks/bench_decompress_maxsim.py +370 -0
  18. late_interaction_kernels-0.0.1/benchmarks/bench_fastplaid.py +311 -0
  19. late_interaction_kernels-0.0.1/benchmarks/bench_fastplaid_e2e.py +345 -0
  20. late_interaction_kernels-0.0.1/benchmarks/bench_flash_maxsim.py +287 -0
  21. late_interaction_kernels-0.0.1/benchmarks/bench_forward.py +155 -0
  22. late_interaction_kernels-0.0.1/benchmarks/bench_fp8.py +73 -0
  23. late_interaction_kernels-0.0.1/benchmarks/bench_fused_head_train.py +110 -0
  24. late_interaction_kernels-0.0.1/benchmarks/bench_inference_edge.py +263 -0
  25. late_interaction_kernels-0.0.1/benchmarks/bench_lateon.py +207 -0
  26. late_interaction_kernels-0.0.1/benchmarks/bench_normalize.py +98 -0
  27. late_interaction_kernels-0.0.1/benchmarks/bench_pylate_lateon.py +410 -0
  28. late_interaction_kernels-0.0.1/benchmarks/bench_pylate_training.py +113 -0
  29. late_interaction_kernels-0.0.1/docs/benchmarks.md +281 -0
  30. late_interaction_kernels-0.0.1/docs/design.md +182 -0
  31. late_interaction_kernels-0.0.1/docs/packed_training.md +113 -0
  32. late_interaction_kernels-0.0.1/docs/supported_models.md +52 -0
  33. late_interaction_kernels-0.0.1/examples/basic.py +69 -0
  34. late_interaction_kernels-0.0.1/examples/packed_training.py +217 -0
  35. late_interaction_kernels-0.0.1/late_interaction_kernels/__init__.py +206 -0
  36. late_interaction_kernels-0.0.1/late_interaction_kernels/_autotune.py +129 -0
  37. late_interaction_kernels-0.0.1/late_interaction_kernels/_utils.py +50 -0
  38. late_interaction_kernels-0.0.1/late_interaction_kernels/autograd.py +244 -0
  39. late_interaction_kernels-0.0.1/late_interaction_kernels/backward.py +274 -0
  40. late_interaction_kernels-0.0.1/late_interaction_kernels/backward_csr.py +202 -0
  41. late_interaction_kernels-0.0.1/late_interaction_kernels/backward_unified.py +277 -0
  42. late_interaction_kernels-0.0.1/late_interaction_kernels/experimental/__init__.py +42 -0
  43. late_interaction_kernels-0.0.1/late_interaction_kernels/forward.py +273 -0
  44. late_interaction_kernels-0.0.1/late_interaction_kernels/fp8.py +415 -0
  45. late_interaction_kernels-0.0.1/late_interaction_kernels/fused_head.py +479 -0
  46. late_interaction_kernels-0.0.1/late_interaction_kernels/matryoshka.py +253 -0
  47. late_interaction_kernels-0.0.1/late_interaction_kernels/plaid.py +979 -0
  48. late_interaction_kernels-0.0.1/late_interaction_kernels/py.typed +0 -0
  49. late_interaction_kernels-0.0.1/late_interaction_kernels/pylate_compat.py +213 -0
  50. late_interaction_kernels-0.0.1/late_interaction_kernels/reference.py +373 -0
  51. late_interaction_kernels-0.0.1/late_interaction_kernels/retrieve.py +272 -0
  52. late_interaction_kernels-0.0.1/late_interaction_kernels/scatter.py +202 -0
  53. late_interaction_kernels-0.0.1/late_interaction_kernels/smooth.py +639 -0
  54. late_interaction_kernels-0.0.1/late_interaction_kernels/soft.py +328 -0
  55. late_interaction_kernels-0.0.1/late_interaction_kernels/topk.py +91 -0
  56. late_interaction_kernels-0.0.1/late_interaction_kernels/varlen.py +490 -0
  57. late_interaction_kernels-0.0.1/late_interaction_kernels/xtr.py +87 -0
  58. late_interaction_kernels-0.0.1/pyproject.toml +137 -0
  59. late_interaction_kernels-0.0.1/scripts/run_all_benchmarks.sh +37 -0
  60. late_interaction_kernels-0.0.1/scripts/sky_decompress_bench.yaml +51 -0
  61. late_interaction_kernels-0.0.1/scripts/sky_fastplaid_e2e.yaml +58 -0
  62. late_interaction_kernels-0.0.1/scripts/sky_lateon_edge.yaml +97 -0
  63. late_interaction_kernels-0.0.1/scripts/sky_test.yaml +32 -0
  64. late_interaction_kernels-0.0.1/tests/conftest.py +40 -0
  65. late_interaction_kernels-0.0.1/tests/test_backward.py +396 -0
  66. late_interaction_kernels-0.0.1/tests/test_backward_unified.py +226 -0
  67. late_interaction_kernels-0.0.1/tests/test_edge_cases.py +72 -0
  68. late_interaction_kernels-0.0.1/tests/test_forward.py +102 -0
  69. late_interaction_kernels-0.0.1/tests/test_fp8.py +147 -0
  70. late_interaction_kernels-0.0.1/tests/test_fused_head.py +171 -0
  71. late_interaction_kernels-0.0.1/tests/test_fused_head_train.py +166 -0
  72. late_interaction_kernels-0.0.1/tests/test_matryoshka.py +76 -0
  73. late_interaction_kernels-0.0.1/tests/test_normalize.py +117 -0
  74. late_interaction_kernels-0.0.1/tests/test_plaid.py +379 -0
  75. late_interaction_kernels-0.0.1/tests/test_pylate_compat.py +266 -0
  76. late_interaction_kernels-0.0.1/tests/test_pylate_compat_warnings.py +177 -0
  77. late_interaction_kernels-0.0.1/tests/test_reference_cpu.py +147 -0
  78. late_interaction_kernels-0.0.1/tests/test_retrieve.py +279 -0
  79. late_interaction_kernels-0.0.1/tests/test_retrieve_cpu.py +434 -0
  80. late_interaction_kernels-0.0.1/tests/test_robustness.py +316 -0
  81. late_interaction_kernels-0.0.1/tests/test_scatter.py +89 -0
  82. late_interaction_kernels-0.0.1/tests/test_smooth.py +181 -0
  83. late_interaction_kernels-0.0.1/tests/test_soft.py +44 -0
  84. late_interaction_kernels-0.0.1/tests/test_topk.py +75 -0
  85. late_interaction_kernels-0.0.1/tests/test_varlen.py +155 -0
  86. late_interaction_kernels-0.0.1/tests/test_xtr.py +60 -0
  87. late_interaction_kernels-0.0.1/uv.lock +4344 -0
@@ -0,0 +1,54 @@
1
+ name: Bug report
2
+ description: A kernel crashes, returns wrong numbers, or is slower than expected.
3
+ labels: ["bug"]
4
+ body:
5
+ - type: textarea
6
+ id: summary
7
+ attributes:
8
+ label: Summary
9
+ description: One sentence — what went wrong?
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: env
15
+ attributes:
16
+ label: Environment
17
+ description: Output of the commands below.
18
+ placeholder: |
19
+ python -c "import torch; print('torch', torch.__version__, 'cuda', torch.version.cuda)"
20
+ python -c "import triton; print('triton', triton.__version__)"
21
+ python -c "import late_interaction_kernels as lik; print('lik', lik.__version__)"
22
+ python -c "import pylate; print('pylate', pylate.__version__)" # if PyLate-related
23
+ nvidia-smi | head -5
24
+ render: shell
25
+ validations:
26
+ required: true
27
+
28
+ - type: textarea
29
+ id: repro
30
+ attributes:
31
+ label: Minimal reproducer
32
+ description: |
33
+ < 30 lines of runnable Python. For PyLate issues, include `patch_pylate()`
34
+ and one loss / scoring call. For perf issues, adapt one of `benchmarks/bench_*.py`.
35
+ render: python
36
+ validations:
37
+ required: true
38
+
39
+ - type: textarea
40
+ id: shape
41
+ attributes:
42
+ label: Shape / dtype
43
+ description: "(Nq, Nd, Lq, Ld, d), dtype, mask usage, backward method."
44
+ placeholder: "Nq=32, Nd=32, Lq=32, Ld=300, d=128, fp16, d_mask=True, auto"
45
+
46
+ - type: textarea
47
+ id: observed
48
+ attributes:
49
+ label: Observed vs expected
50
+ description: |
51
+ Tracebacks, numerical diffs, or perf numbers. For PyLate issues,
52
+ include the parity delta vs `LIK_DISABLE=1` (kill-switch path).
53
+ validations:
54
+ required: true
@@ -0,0 +1,11 @@
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: API reference
4
+ url: https://github.com/hcompai/late-interaction-kernels/blob/main/README.md#api
5
+ about: Public surface — patch_pylate(), MaxSimScorer, retrieve, maxsim_varlen, etc.
6
+ - name: Supported models
7
+ url: https://github.com/hcompai/late-interaction-kernels/blob/main/docs/supported_models.md
8
+ about: Which ColBERT / ColPali / ModernColBERT / LateOn-Code / mxbai-edge models we accelerate today.
9
+ - name: Packed / varlen training
10
+ url: https://github.com/hcompai/late-interaction-kernels/blob/main/docs/packed_training.md
11
+ about: Cookbook for wiring maxsim_varlen into a heterogeneous-length training loop.
@@ -0,0 +1,25 @@
1
+ name: Feature request
2
+ description: Propose a new kernel variant, API change, or integration.
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: textarea
6
+ id: motivation
7
+ attributes:
8
+ label: Motivation
9
+ description: What problem does this solve? Who's hitting it?
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: proposal
15
+ attributes:
16
+ label: Proposal
17
+ description: Sketch API / behavior. Pseudo-code is great.
18
+ validations:
19
+ required: true
20
+
21
+ - type: textarea
22
+ id: alternatives
23
+ attributes:
24
+ label: Alternatives considered
25
+ description: Why not solve this outside the library?
@@ -0,0 +1,16 @@
1
+ ## Description
2
+
3
+ <!-- One or two sentences. Link any related issue. -->
4
+
5
+ ## Approach
6
+
7
+ <!-- Brief notes on the implementation. -->
8
+
9
+ ## Test plan
10
+
11
+ <!-- How did you verify correctness? Shapes, GPU, numbers. -->
12
+
13
+ - [ ] `ruff check . && ruff format --check .` and `pytest -q` pass
14
+ - [ ] Parity vs `reference.maxsim_reference` holds for new numerical paths
15
+ - [ ] Benchmarks included for performance-motivated changes
16
+ - [ ] `CHANGELOG.md` and README updated for public API changes
@@ -0,0 +1,79 @@
1
+ name: CI
2
+
3
+ permissions:
4
+ contents: read
5
+
6
+ on:
7
+ pull_request:
8
+ paths:
9
+ - "late_interaction_kernels/**"
10
+ - "tests/**"
11
+ - "pyproject.toml"
12
+ - "uv.lock"
13
+ - ".github/workflows/ci.yml"
14
+ push:
15
+ branches: [main]
16
+ workflow_dispatch:
17
+
18
+ concurrency:
19
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || '' }}
20
+ cancel-in-progress: true
21
+
22
+ jobs:
23
+ lint:
24
+ name: Lint (ruff)
25
+ runs-on: ubuntu-latest
26
+ steps:
27
+ - uses: actions/checkout@v4
28
+ - name: Install uv
29
+ uses: astral-sh/setup-uv@v5
30
+ with:
31
+ enable-cache: true
32
+ prune-cache: false
33
+ - name: Install dependencies
34
+ run: uv sync --frozen --extra dev
35
+ - name: Run ruff linter
36
+ run: uv run ruff check
37
+ - name: Run ruff formatter check
38
+ run: uv run ruff format --check
39
+
40
+ typecheck:
41
+ name: Type Check (ty)
42
+ runs-on: ubuntu-latest
43
+ steps:
44
+ - uses: actions/checkout@v4
45
+ - name: Install uv
46
+ uses: astral-sh/setup-uv@v5
47
+ with:
48
+ enable-cache: true
49
+ prune-cache: false
50
+ - name: Install dependencies
51
+ run: uv sync --frozen --extra dev
52
+ - name: Run ty type checker
53
+ run: uv run ty check --output-format github
54
+
55
+ cpu-smoke:
56
+ name: CPU smoke (py${{ matrix.python-version }})
57
+ runs-on: ubuntu-latest
58
+ strategy:
59
+ fail-fast: false
60
+ matrix:
61
+ python-version: ["3.10", "3.11", "3.12"]
62
+ steps:
63
+ - uses: actions/checkout@v4
64
+ - name: Install uv
65
+ uses: astral-sh/setup-uv@v5
66
+ with:
67
+ enable-cache: true
68
+ prune-cache: false
69
+ - name: Install dependencies
70
+ run: uv sync --frozen --extra dev --python ${{ matrix.python-version }}
71
+ - name: Import check
72
+ run: |
73
+ uv run --python ${{ matrix.python-version }} python -c "import late_interaction_kernels as lik; print(lik.__version__)"
74
+ uv run --python ${{ matrix.python-version }} python -c "from late_interaction_kernels.reference import maxsim_reference, maxsim_reference_soft, maxsim_reference_varlen; print('reference ok')"
75
+ - name: Run CPU-safe tests (CUDA tests auto-skip)
76
+ run: uv run --python ${{ matrix.python-version }} pytest -q
77
+
78
+ # GPU tests live on SkyPilot — run manually or via a scheduled dispatch.
79
+ # See scripts/sky_test.yaml.
@@ -0,0 +1,54 @@
1
+ name: Publish Python Package
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ release-build:
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - uses: actions/checkout@v6
16
+
17
+ - uses: actions/setup-python@v6
18
+ with:
19
+ python-version: "3.x"
20
+
21
+ - name: Build release distributions
22
+ run: |
23
+ python -m pip install build
24
+ python -m build
25
+
26
+ - name: Upload distributions
27
+ uses: actions/upload-artifact@v6
28
+ with:
29
+ name: release-dists
30
+ path: dist/
31
+
32
+ pypi-publish:
33
+ runs-on: ubuntu-latest
34
+
35
+ needs:
36
+ - release-build
37
+
38
+ permissions:
39
+ # IMPORTANT: this permission is mandatory for trusted publishing
40
+ id-token: write
41
+
42
+ environment:
43
+ name: pypi
44
+ url: https://pypi.org/project/late-interaction-kernels/
45
+
46
+ steps:
47
+ - name: Retrieve release distributions
48
+ uses: actions/download-artifact@v7
49
+ with:
50
+ name: release-dists
51
+ path: dist/
52
+
53
+ - name: Publish release distributions to PyPI
54
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,223 @@
1
+ # Custom
2
+ .DS_Store
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[codz]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py.cover
53
+ *.lcov
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ # Pipfile.lock
100
+
101
+ # UV
102
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # uv.lock
106
+
107
+ # poetry
108
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
109
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
110
+ # commonly ignored for libraries.
111
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
112
+ # poetry.lock
113
+ # poetry.toml
114
+
115
+ # pdm
116
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
117
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
118
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
119
+ # pdm.lock
120
+ # pdm.toml
121
+ .pdm-python
122
+ .pdm-build/
123
+
124
+ # pixi
125
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
126
+ # pixi.lock
127
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
128
+ # in the .venv directory. It is recommended not to include this directory in version control.
129
+ .pixi/*
130
+ !.pixi/config.toml
131
+
132
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
133
+ __pypackages__/
134
+
135
+ # Celery stuff
136
+ celerybeat-schedule*
137
+ celerybeat.pid
138
+
139
+ # Redis
140
+ *.rdb
141
+ *.aof
142
+ *.pid
143
+
144
+ # RabbitMQ
145
+ mnesia/
146
+ rabbitmq/
147
+ rabbitmq-data/
148
+
149
+ # ActiveMQ
150
+ activemq-data/
151
+
152
+ # SageMath parsed files
153
+ *.sage.py
154
+
155
+ # Environments
156
+ .env
157
+ .envrc
158
+ .venv
159
+ env/
160
+ venv/
161
+ ENV/
162
+ env.bak/
163
+ venv.bak/
164
+
165
+ # Spyder project settings
166
+ .spyderproject
167
+ .spyproject
168
+
169
+ # Rope project settings
170
+ .ropeproject
171
+
172
+ # mkdocs documentation
173
+ /site
174
+
175
+ # mypy
176
+ .mypy_cache/
177
+ .dmypy.json
178
+ dmypy.json
179
+
180
+ # Pyre type checker
181
+ .pyre/
182
+
183
+ # pytype static type analyzer
184
+ .pytype/
185
+
186
+ # Cython debug symbols
187
+ cython_debug/
188
+
189
+ # PyCharm
190
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
191
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
192
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
193
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
194
+ # .idea/
195
+
196
+ # Abstra
197
+ # Abstra is an AI-powered process automation framework.
198
+ # Ignore directories containing user credentials, local state, and settings.
199
+ # Learn more at https://abstra.io/docs
200
+ .abstra/
201
+
202
+ # Visual Studio Code
203
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
204
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
205
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
206
+ # you could uncomment the following to ignore the entire vscode folder
207
+ # .vscode/
208
+ # Temporary file for partial code execution
209
+ tempCodeRunnerFile.py
210
+
211
+ # Ruff stuff:
212
+ .ruff_cache/
213
+
214
+ # PyPI configuration file
215
+ .pypirc
216
+
217
+ # Marimo
218
+ marimo/_static/
219
+ marimo/_lsp/
220
+ __marimo__/
221
+
222
+ # Streamlit
223
+ .streamlit/secrets.toml
@@ -0,0 +1,51 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented here.
4
+ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the
5
+ project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [0.0.1] - 2026-05-02
8
+
9
+ Fused Triton kernels for late-interaction (MaxSim) scoring, with a high-level
10
+ PyTorch API and PyLate drop-in.
11
+
12
+ ### Added
13
+
14
+ - **Core MaxSim kernels** — `maxsim` (autograd-aware) and `maxsim_inference`
15
+ with fused L2-normalize, mask handling, and a `unified` / `csr` / `atomic`
16
+ backward selector (`set_backward_method`, default `auto`).
17
+ - **Ragged / packed batches** — `maxsim_varlen` over `cu_seqlens`-indexed
18
+ flat buffers, autograd-aware on both `Q` and `D`.
19
+ - **Pair-list scoring** — `maxsim_inference_scatter` scores arbitrary
20
+ `(query_index, doc_index)` pairs from packed batches and returns
21
+ `[num_pairs]` directly (vLLM-style reranker scheduling).
22
+ - **Fused D-side head** — `maxsim_from_hidden` (inference) and
23
+ `maxsim_from_hidden_train` (closed-form backward) apply
24
+ projection + L2-normalize + MaxSim in a single pass over raw
25
+ `[Nd, Ld, d_model]` hidden states.
26
+ - **PLAID / ColBERTv2** — `plaid_approx_score` (approximate scoring) and
27
+ `maxsim_residual` / `maxsim_residual_varlen` (exact rerank with on-the-fly
28
+ 2/4/8-bit residual decompression + L2-normalize + MaxSim, forward-only on
29
+ varlen).
30
+ - **FP8 inference** — `maxsim_inference_fp8` with per-tensor / per-token
31
+ e4m3 inputs, fp32 accumulator, and a score-tie fallback harness.
32
+ - **High-level API** — `MaxSimScorer(nn.Module)` and `retrieve(Q, D, top_k)`,
33
+ both with transparent pure-PyTorch CPU fallback so training and retrieval
34
+ code is unit-testable on macOS / Windows / CPU-only CI.
35
+ - **PyLate drop-in** — `patch_pylate` / `unpatch_pylate` patch
36
+ `colbert_scores` and `colbert_kd_scores` across `Contrastive`,
37
+ `CachedContrastive`, and `Distillation`. `LIK_DISABLE=1` is the
38
+ process-wide kill switch.
39
+ - **Experimental kernels** — `late_interaction_kernels.experimental` ships
40
+ `soft_maxsim`, `smooth_maxsim`, `maxsim_xtr`, and `maxsim_matryoshka`.
41
+ - **FP8 helpers** — `late_interaction_kernels.fp8` exposes per-tensor /
42
+ per-token quantize / dequantize utilities.
43
+ - Per-GPU autotune (Hopper / Ampere / Ada / generic) with shared-memory
44
+ pruning; warp specialization on Triton ≥ 3.2 with transparent fallback.
45
+ - Pure-PyTorch reference (`late_interaction_kernels.reference`) used as
46
+ ground truth in tests and as the CPU fallback path.
47
+ - Test suite covering forward / backward parity, varlen, soft/smooth,
48
+ edge cases, PyLate compatibility, CPU fallback, and `gradcheck` on the
49
+ high-level API.
50
+ - Benchmarks for every kernel, plus end-to-end PyLate / LateOn training
51
+ and retrieval scripts under `benchmarks/` and `scripts/`.
@@ -0,0 +1,62 @@
1
+ # Contributing
2
+
3
+ ## Reporting issues
4
+
5
+ Use the **Bug report** or **Feature request** templates under
6
+ [Issues → New issue](https://github.com/hcompai/late-interaction-kernels/issues/new/choose).
7
+
8
+ ## Autotune for a new GPU
9
+
10
+ If performance is poor on a GPU we don't have a shortlist for:
11
+
12
+ 1. Run the benchmark for the shape you care about
13
+ (`benchmarks/bench_forward.py`, `benchmarks/bench_inference_edge.py`,
14
+ `benchmarks/bench_backward_method.py`).
15
+ 2. Add a shortlist in `late_interaction_kernels/_autotune.py` keyed on
16
+ the device-name prefix.
17
+ 3. Re-run the benchmark and include before / after in the PR.
18
+
19
+ ## New kernel variant
20
+
21
+ For a new reduction flavor (e.g. top-K, soft variants), keep it in a
22
+ separate module under `late_interaction_kernels/` and follow the
23
+ existing split:
24
+
25
+ - internal `_forward` returning `(scores, argmax)` without autograd;
26
+ - `torch.autograd.Function` wrapper that saves minimal state;
27
+ - pure-PyTorch reference in `late_interaction_kernels/reference.py`;
28
+ - parity tests in `tests/`.
29
+
30
+ Research kernels with no production user yet land under
31
+ `late_interaction_kernels/experimental/`.
32
+
33
+ ## Development setup
34
+
35
+ ```bash
36
+ git clone https://github.com/hcompai/late-interaction-kernels
37
+ cd late-interaction-kernels
38
+ pip install -e ".[dev,pylate]"
39
+
40
+ ruff check . && ruff format --check .
41
+ pytest -q
42
+ ```
43
+
44
+ ## Style
45
+
46
+ - Python 3.10+ (the oldest version exercised by the CI matrix); type hints on public APIs.
47
+ - Comments explain *why*, not *what*. Don't narrate trivial code.
48
+ - Match the existing docstring tone — short, concrete, no marketing.
49
+
50
+ ## Publishing a release
51
+
52
+ 1. Ensure `main` is green and `CHANGELOG.md` has a filled-in section for the release (rename the `Unreleased` block to `[X.Y.Z] - YYYY-MM-DD`).
53
+ 2. On GitHub: **Releases → Draft a new release**, tag `vX.Y.Z` off `main`.
54
+ 3. Paste the matching `CHANGELOG.md` section as the release body, then **Publish**.
55
+
56
+ The [`publish.yml`](.github/workflows/publish.yml) workflow builds and uploads
57
+ to PyPI automatically via OIDC trusted publishing. No token needed.
58
+
59
+ ## License
60
+
61
+ By contributing you agree your work is licensed under Apache 2.0
62
+ (see [`LICENSE`](LICENSE)).