cordon 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. cordon-0.1.0/.containerignore +75 -0
  2. cordon-0.1.0/.github/dependabot.yml +39 -0
  3. cordon-0.1.0/.github/workflows/ci.yml +123 -0
  4. cordon-0.1.0/.github/workflows/release.yml +108 -0
  5. cordon-0.1.0/.gitignore +215 -0
  6. cordon-0.1.0/.pre-commit-config.yaml +34 -0
  7. cordon-0.1.0/Containerfile +30 -0
  8. cordon-0.1.0/LICENSE +201 -0
  9. cordon-0.1.0/Makefile +106 -0
  10. cordon-0.1.0/PKG-INFO +287 -0
  11. cordon-0.1.0/README.md +260 -0
  12. cordon-0.1.0/docs/CONTAINER.md +185 -0
  13. cordon-0.1.0/docs/architecture.md +340 -0
  14. cordon-0.1.0/docs/examples/README.md +111 -0
  15. cordon-0.1.0/docs/examples/output/android-output.txt +435 -0
  16. cordon-0.1.0/docs/examples/output/apache-output.txt +403 -0
  17. cordon-0.1.0/docs/examples/output/hadoop-output.txt +339 -0
  18. cordon-0.1.0/docs/examples/output/hdfs-output.txt +492 -0
  19. cordon-0.1.0/docs/examples/output/linux-output.txt +305 -0
  20. cordon-0.1.0/docs/examples/output/openssh-output.txt +380 -0
  21. cordon-0.1.0/docs/examples/output/proxifier-output.txt +323 -0
  22. cordon-0.1.0/docs/examples/output/spark-output.txt +409 -0
  23. cordon-0.1.0/docs/examples/output/windows-output.txt +377 -0
  24. cordon-0.1.0/docs/examples/output/zookeeper-output.txt +379 -0
  25. cordon-0.1.0/docs/llama-cpp.md +165 -0
  26. cordon-0.1.0/examples/library_usage.py +42 -0
  27. cordon-0.1.0/pyproject.toml +113 -0
  28. cordon-0.1.0/src/cordon/__init__.py +14 -0
  29. cordon-0.1.0/src/cordon/analysis/__init__.py +4 -0
  30. cordon-0.1.0/src/cordon/analysis/scorer.py +256 -0
  31. cordon-0.1.0/src/cordon/analysis/thresholder.py +51 -0
  32. cordon-0.1.0/src/cordon/cli.py +230 -0
  33. cordon-0.1.0/src/cordon/core/__init__.py +19 -0
  34. cordon-0.1.0/src/cordon/core/config.py +64 -0
  35. cordon-0.1.0/src/cordon/core/types.py +141 -0
  36. cordon-0.1.0/src/cordon/embedding/__init__.py +29 -0
  37. cordon-0.1.0/src/cordon/embedding/llama_cpp.py +95 -0
  38. cordon-0.1.0/src/cordon/embedding/transformer.py +135 -0
  39. cordon-0.1.0/src/cordon/ingestion/__init__.py +3 -0
  40. cordon-0.1.0/src/cordon/ingestion/reader.py +45 -0
  41. cordon-0.1.0/src/cordon/pipeline.py +126 -0
  42. cordon-0.1.0/src/cordon/postprocess/__init__.py +4 -0
  43. cordon-0.1.0/src/cordon/postprocess/formatter.py +68 -0
  44. cordon-0.1.0/src/cordon/postprocess/merger.py +77 -0
  45. cordon-0.1.0/src/cordon/py.typed +2 -0
  46. cordon-0.1.0/src/cordon/segmentation/__init__.py +3 -0
  47. cordon-0.1.0/src/cordon/segmentation/windower.py +80 -0
  48. cordon-0.1.0/tests/__init__.py +0 -0
  49. cordon-0.1.0/tests/test_analysis.py +225 -0
  50. cordon-0.1.0/tests/test_core.py +88 -0
  51. cordon-0.1.0/tests/test_ingestion.py +98 -0
  52. cordon-0.1.0/tests/test_integration.py +126 -0
  53. cordon-0.1.0/tests/test_llama_cpp.py +265 -0
  54. cordon-0.1.0/tests/test_postprocess.py +186 -0
  55. cordon-0.1.0/tests/test_segmentation.py +92 -0
  56. cordon-0.1.0/tests/test_transformer.py +181 -0
@@ -0,0 +1,75 @@
1
+ # .containerignore - Exclude files from container build context
2
+ # This reduces build context size and speeds up container builds
3
+
4
+ # Python cache and compiled files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+ *.so
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+ ENV/
14
+ env/
15
+ .virtualenv/
16
+
17
+ # Distribution / packaging
18
+ build/
19
+ dist/
20
+ *.egg-info/
21
+ .eggs/
22
+ wheels/
23
+
24
+ # Testing
25
+ .pytest_cache/
26
+ .coverage
27
+ htmlcov/
28
+ .tox/
29
+ .nox/
30
+
31
+ # Development tools
32
+ .mypy_cache/
33
+ .ruff_cache/
34
+ .pre-commit-config.yaml
35
+ .git/
36
+ .github/
37
+ .gitlab/
38
+ .gitignore
39
+
40
+ # IDE and editor files
41
+ .vscode/
42
+ .idea/
43
+ *.swp
44
+ *.swo
45
+ *~
46
+ .DS_Store
47
+
48
+ # Documentation build artifacts
49
+ docs/_build/
50
+ docs/.doctrees/
51
+
52
+ # Container build files and image archives (not needed inside the image)
53
+ Containerfile
54
+ Dockerfile
55
+ .dockerignore
56
+ .containerignore
57
+ *.tar
58
+ *.tar.gz
59
+
60
+ # Logs and temporary files
61
+ *.log
62
+ logs/
63
+ tmp/
64
+ temp/
65
+
66
+ # CI/CD
67
+ .github/
68
+ .gitlab-ci.yml
69
+
70
+ # Project-specific
71
+ examples/
72
+ tests/
73
+ *.md
74
+ !README.md
75
+ !LICENSE
@@ -0,0 +1,39 @@
1
+ version: 2
2
+ updates:
3
+ # GitHub Actions updates
4
+ - package-ecosystem: "github-actions"
5
+ directory: "/"
6
+ schedule:
7
+ interval: "weekly"
8
+ day: "monday"
9
+ labels:
10
+ - "dependencies"
11
+ - "github-actions"
12
+ groups:
13
+ actions:
14
+ patterns:
15
+ - "*"
16
+
17
+ # Python dependencies updates
18
+ - package-ecosystem: "pip"
19
+ directory: "/"
20
+ schedule:
21
+ interval: "weekly"
22
+ day: "monday"
23
+ labels:
24
+ - "dependencies"
25
+ - "python"
26
+ groups:
27
+ python-dependencies:
28
+ patterns:
29
+ - "*"
30
+ exclude-patterns:
31
+ - "pytest*"
32
+ - "mypy"
33
+ - "ruff"
34
+ dev-dependencies:
35
+ patterns:
36
+ - "pytest*"
37
+ - "mypy"
38
+ - "ruff"
39
+ - "pre-commit"
@@ -0,0 +1,123 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+
9
+ jobs:
10
+ test:
11
+ name: Test (Python ${{ matrix.python-version }})
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
17
+
18
+ steps:
19
+ - name: Checkout code
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+
27
+ - name: Install uv
28
+ uses: astral-sh/setup-uv@v4
29
+
30
+ - name: Install dependencies
31
+ run: uv pip install --system -e ".[dev]"
32
+
33
+ - name: Run tests with coverage
34
+ env:
35
+ OMP_NUM_THREADS: 1
36
+ run: pytest tests/ --cov=cordon --cov-report=term-missing --cov-report=html
37
+
38
+ - name: Upload coverage report
39
+ if: matrix.python-version == '3.12'
40
+ uses: actions/upload-artifact@v4
41
+ with:
42
+ name: coverage-report
43
+ path: htmlcov/
44
+ retention-days: 30
45
+
46
+ pre-commit:
47
+ name: Pre-commit checks
48
+ runs-on: ubuntu-latest
49
+
50
+ steps:
51
+ - name: Checkout code
52
+ uses: actions/checkout@v4
53
+
54
+ - name: Set up Python 3.12
55
+ uses: actions/setup-python@v5
56
+ with:
57
+ python-version: "3.12"
58
+
59
+ - name: Install uv
60
+ uses: astral-sh/setup-uv@v4
61
+
62
+ - name: Install dependencies
63
+ run: uv pip install --system -e ".[dev]"
64
+
65
+ - name: Run pre-commit
66
+ run: pre-commit run --all-files
67
+
68
+ container:
69
+ name: Container build
70
+ runs-on: ubuntu-latest
71
+ permissions:
72
+ contents: read
73
+ packages: write
74
+ attestations: write
75
+ id-token: write
76
+
77
+ steps:
78
+ - name: Checkout code
79
+ uses: actions/checkout@v4
80
+
81
+ - name: Set up QEMU
82
+ uses: docker/setup-qemu-action@v3
83
+
84
+ - name: Set up Docker Buildx
85
+ uses: docker/setup-buildx-action@v3
86
+
87
+ - name: Log in to GitHub Container Registry
88
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
89
+ uses: docker/login-action@v3
90
+ with:
91
+ registry: ghcr.io
92
+ username: ${{ github.actor }}
93
+ password: ${{ secrets.GITHUB_TOKEN }}
94
+
95
+ - name: Extract metadata
96
+ id: meta
97
+ uses: docker/metadata-action@v5
98
+ with:
99
+ images: ghcr.io/${{ github.repository }}
100
+ tags: |
101
+ type=ref,event=pr
102
+ type=raw,value=dev,enable=${{ github.ref == 'refs/heads/main' }}
103
+
104
+ - name: Build and push container image
105
+ id: push
106
+ uses: docker/build-push-action@v6
107
+ with:
108
+ context: .
109
+ file: Containerfile
110
+ platforms: ${{ github.event_name == 'pull_request' && 'linux/amd64' || 'linux/amd64,linux/arm64' }}
111
+ push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
112
+ tags: ${{ steps.meta.outputs.tags }}
113
+ labels: ${{ steps.meta.outputs.labels }}
114
+ cache-from: type=gha
115
+ cache-to: ${{ github.ref == 'refs/heads/main' && 'type=gha,mode=min' || '' }}
116
+
117
+ - name: Generate artifact attestation
118
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
119
+ uses: actions/attest-build-provenance@v2
120
+ with:
121
+ subject-name: ghcr.io/${{ github.repository }}
122
+ subject-digest: ${{ steps.push.outputs.digest }}
123
+ push-to-registry: true
@@ -0,0 +1,108 @@
1
+ name: Release
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build:
9
+ name: Build Python package
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout code
14
+ uses: actions/checkout@v4
15
+
16
+ - name: Set up Python 3.12
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.12"
20
+
21
+ - name: Install uv
22
+ uses: astral-sh/setup-uv@v4
23
+
24
+ - name: Install build tools
25
+ run: uv pip install --system build
26
+
27
+ - name: Build distributions
28
+ run: python -m build
29
+
30
+ - name: Upload distributions
31
+ uses: actions/upload-artifact@v4
32
+ with:
33
+ name: release-dists
34
+ path: dist/
35
+
36
+ pypi-publish:
37
+ name: Publish to PyPI
38
+ runs-on: ubuntu-latest
39
+ needs: build
40
+ permissions:
41
+ id-token: write
42
+
43
+ steps:
44
+ - name: Download distributions
45
+ uses: actions/download-artifact@v4
46
+ with:
47
+ name: release-dists
48
+ path: dist/
49
+
50
+ - name: Publish to PyPI
51
+ uses: pypa/gh-action-pypi-publish@release/v1
52
+
53
+ container-release:
54
+ name: Release container image
55
+ runs-on: ubuntu-latest
56
+ permissions:
57
+ contents: read
58
+ packages: write
59
+ attestations: write
60
+ id-token: write
61
+
62
+ steps:
63
+ - name: Checkout code
64
+ uses: actions/checkout@v4
65
+
66
+ - name: Set up QEMU
67
+ uses: docker/setup-qemu-action@v3
68
+
69
+ - name: Set up Docker Buildx
70
+ uses: docker/setup-buildx-action@v3
71
+
72
+ - name: Log in to GitHub Container Registry
73
+ uses: docker/login-action@v3
74
+ with:
75
+ registry: ghcr.io
76
+ username: ${{ github.actor }}
77
+ password: ${{ secrets.GITHUB_TOKEN }}
78
+
79
+ - name: Extract metadata
80
+ id: meta
81
+ uses: docker/metadata-action@v5
82
+ with:
83
+ images: ghcr.io/${{ github.repository }}
84
+ tags: |
85
+ type=semver,pattern={{version}}
86
+ type=semver,pattern={{major}}.{{minor}}
87
+ type=semver,pattern={{major}}
88
+ type=raw,value=latest
89
+
90
+ - name: Build and push container image
91
+ id: push
92
+ uses: docker/build-push-action@v6
93
+ with:
94
+ context: .
95
+ file: Containerfile
96
+ platforms: linux/amd64,linux/arm64
97
+ push: true
98
+ tags: ${{ steps.meta.outputs.tags }}
99
+ labels: ${{ steps.meta.outputs.labels }}
100
+ cache-from: type=gha
101
+ cache-to: type=gha,mode=min
102
+
103
+ - name: Generate artifact attestation
104
+ uses: actions/attest-build-provenance@v2
105
+ with:
106
+ subject-name: ghcr.io/${{ github.repository }}
107
+ subject-digest: ${{ steps.push.outputs.digest }}
108
+ push-to-registry: true
@@ -0,0 +1,215 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject the date and other information into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data.
200
+ # Refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # VSCode
210
+ .vscode/
211
+
212
+ # GGUF model files (large binary files)
213
+ # Keep the models/ directory structure but ignore downloaded models
214
+ models/*.gguf
215
+ models/*/*.gguf
@@ -0,0 +1,34 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v6.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+ exclude: uv.lock
10
+ - id: check-merge-conflict
11
+ - id: check-toml
12
+ - id: debug-statements
13
+ - id: detect-private-key
14
+
15
+ - repo: https://github.com/astral-sh/ruff-pre-commit
16
+ rev: v0.8.4
17
+ hooks:
18
+ - id: ruff
19
+ args: [--fix, --exit-non-zero-on-fix]
20
+ - id: ruff-format
21
+
22
+ - repo: https://github.com/pre-commit/mirrors-mypy
23
+ rev: v1.14.0
24
+ hooks:
25
+ - id: mypy
26
+ additional_dependencies:
27
+ - "numpy>=1.24.0"
28
+ - "types-dataclasses"
29
+ files: ^src/
30
+
31
+ - repo: https://github.com/zricethezav/gitleaks
32
+ rev: v8.29.1
33
+ hooks:
34
+ - id: gitleaks
@@ -0,0 +1,30 @@
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONUNBUFFERED=1 \
4
+ PYTHONDONTWRITEBYTECODE=1 \
5
+ TRANSFORMERS_CACHE=/root/.cache/huggingface
6
+
7
+ WORKDIR /app
8
+
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ gcc g++ cmake git \
11
+ libvulkan1 vulkan-tools \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ RUN pip install --no-cache-dir uv
15
+
16
+ COPY pyproject.toml README.md LICENSE ./
17
+ COPY src/ ./src/
18
+
19
+ RUN uv pip install --system -e ".[llama-cpp,faiss-cpu]"
20
+
21
+ RUN CMAKE_ARGS="-DGGML_VULKAN=on" \
22
+ uv pip install --system --no-cache-dir llama-cpp-python
23
+
24
+ RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')" && \
25
+ python -c "from huggingface_hub import hf_hub_download; hf_hub_download('second-state/All-MiniLM-L6-v2-Embedding-GGUF', 'all-MiniLM-L6-v2-Q4_K_M.gguf')"
26
+
27
+ WORKDIR /logs
28
+
29
+ ENTRYPOINT ["cordon"]
30
+ CMD ["--help"]