multiregex 2.0.0__tar.gz → 2.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {multiregex-2.0.0 → multiregex-2.0.2}/.gitattributes +2 -0
- multiregex-2.0.2/.github/CODEOWNERS +2 -0
- multiregex-2.0.2/.github/ISSUE_TEMPLATE/config.yml +1 -0
- multiregex-2.0.2/.github/ISSUE_TEMPLATE/issue-template.md +6 -0
- multiregex-2.0.0/.github/PULL_REQUEST_TEMPLATE.md → multiregex-2.0.2/.github/pull_request_template.md +3 -1
- multiregex-2.0.2/.github/workflows/build.yml +44 -0
- multiregex-2.0.2/.github/workflows/ci.yml +60 -0
- multiregex-2.0.2/.gitignore +127 -0
- multiregex-2.0.2/.pre-commit-config.yaml +65 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/CHANGELOG.rst +9 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/PKG-INFO +23 -21
- {multiregex-2.0.0 → multiregex-2.0.2}/README.md +12 -10
- {multiregex-2.0.0 → multiregex-2.0.2}/docs/make.bat +1 -1
- {multiregex-2.0.0/src → multiregex-2.0.2}/multiregex/__init__.py +32 -23
- multiregex-2.0.2/multiregex/py.typed +0 -0
- multiregex-2.0.2/multiregex.egg-info/PKG-INFO +125 -0
- multiregex-2.0.2/multiregex.egg-info/SOURCES.txt +35 -0
- multiregex-2.0.2/multiregex.egg-info/dependency_links.txt +1 -0
- multiregex-2.0.2/multiregex.egg-info/requires.txt +1 -0
- multiregex-2.0.2/multiregex.egg-info/top_level.txt +1 -0
- multiregex-2.0.2/pixi.lock +4161 -0
- multiregex-2.0.2/pixi.toml +58 -0
- multiregex-2.0.2/pyproject.toml +82 -0
- multiregex-2.0.2/setup.cfg +4 -0
- {multiregex-2.0.0/src → multiregex-2.0.2/stubs}/ahocorasick.pyi +2 -3
- {multiregex-2.0.0 → multiregex-2.0.2}/tests/test_bench.py +1 -1
- {multiregex-2.0.0 → multiregex-2.0.2}/tests/test_cpython_tests.py +1 -1
- {multiregex-2.0.0 → multiregex-2.0.2}/tests/test_multiregex.py +5 -3
- multiregex-2.0.0/.flake8 +0 -11
- multiregex-2.0.0/.github/CODEOWNERS +0 -1
- multiregex-2.0.0/.github/workflows/ci.yml +0 -68
- multiregex-2.0.0/.pre-commit-config.yaml +0 -28
- multiregex-2.0.0/environment.yml +0 -22
- multiregex-2.0.0/pyproject.toml +0 -60
- multiregex-2.0.0/setup.cfg +0 -31
- multiregex-2.0.0/setup.py +0 -3
- {multiregex-2.0.0 → multiregex-2.0.2}/.github/dependabot.yml +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/LICENSE +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/docs/Makefile +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/docs/changelog.rst +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/docs/conf.py +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/docs/index.rst +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/test_utils/__init__.py +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/test_utils/cpython_test_re.py +0 -0
- {multiregex-2.0.0 → multiregex-2.0.2}/tests/conftest.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
blank_issues_enabled: false
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
<!-- ⚠️ This is an open-source repository. Do not share sensitive information. -->
|
|
2
|
+
|
|
1
3
|
<!--
|
|
2
4
|
Thank you for pull request.
|
|
3
5
|
Below are a few things we ask you kindly to self-check before getting a review. Remove checks that are not relevant.
|
|
@@ -5,4 +7,4 @@ Below are a few things we ask you kindly to self-check before getting a review.
|
|
|
5
7
|
|
|
6
8
|
# Checklist
|
|
7
9
|
|
|
8
|
-
|
|
10
|
+
- [ ] Added a `CHANGELOG.rst` entry
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: Build
|
|
2
|
+
on:
|
|
3
|
+
pull_request:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- "main"
|
|
7
|
+
tags:
|
|
8
|
+
- "*"
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
with:
|
|
16
|
+
ref: ${{ github.ref }}
|
|
17
|
+
fetch-depth: 0
|
|
18
|
+
- name: Set up pixi
|
|
19
|
+
uses: prefix-dev/setup-pixi@v0.8.0
|
|
20
|
+
with:
|
|
21
|
+
environments: build
|
|
22
|
+
- name: Build project
|
|
23
|
+
run: pixi run -e build build-wheel
|
|
24
|
+
- name: Upload package
|
|
25
|
+
uses: actions/upload-artifact@v4
|
|
26
|
+
with:
|
|
27
|
+
name: artifact
|
|
28
|
+
path: dist/*
|
|
29
|
+
|
|
30
|
+
release:
|
|
31
|
+
name: Publish package
|
|
32
|
+
if: startsWith(github.ref, 'refs/tags/')
|
|
33
|
+
needs: [build]
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
permissions:
|
|
36
|
+
id-token: write
|
|
37
|
+
environment: pypi
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/download-artifact@v4
|
|
40
|
+
with:
|
|
41
|
+
name: artifact
|
|
42
|
+
path: dist
|
|
43
|
+
- name: Publish package on PyPi
|
|
44
|
+
uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches:
|
|
5
|
+
- main
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
defaults:
|
|
9
|
+
run:
|
|
10
|
+
shell: bash -el {0}
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
unit-tests:
|
|
14
|
+
name: pytest
|
|
15
|
+
timeout-minutes: 10
|
|
16
|
+
runs-on: ${{ matrix.os }}
|
|
17
|
+
strategy:
|
|
18
|
+
fail-fast: false
|
|
19
|
+
matrix:
|
|
20
|
+
include:
|
|
21
|
+
- { os: ubuntu-latest, environment: py38 }
|
|
22
|
+
- { os: ubuntu-latest, environment: py312 }
|
|
23
|
+
- { os: windows-latest, environment: py38 }
|
|
24
|
+
- { os: windows-latest, environment: py312 }
|
|
25
|
+
- { os: macos-latest, environment: py38 }
|
|
26
|
+
- { os: macos-latest, environment: py312 }
|
|
27
|
+
steps:
|
|
28
|
+
- name: Checkout branch
|
|
29
|
+
uses: actions/checkout@v4
|
|
30
|
+
with:
|
|
31
|
+
ref: ${{ github.ref }}
|
|
32
|
+
fetch-depth: 0
|
|
33
|
+
- name: Set up pixi
|
|
34
|
+
uses: prefix-dev/setup-pixi@v0.8.0
|
|
35
|
+
with:
|
|
36
|
+
environments: ${{ matrix.environment }}
|
|
37
|
+
- name: Install repository
|
|
38
|
+
run: pixi run -e ${{ matrix.environment }} postinstall
|
|
39
|
+
- name: Run unittests
|
|
40
|
+
uses: quantco/pytest-action@v2
|
|
41
|
+
with:
|
|
42
|
+
report-title: Unit tests ${{ matrix.environment }}
|
|
43
|
+
custom-pytest: pixi run -e ${{ matrix.environment }} pytest
|
|
44
|
+
|
|
45
|
+
pre-commit-checks:
|
|
46
|
+
name: Pre-commit Checks
|
|
47
|
+
timeout-minutes: 30
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
steps:
|
|
50
|
+
- name: Checkout branch
|
|
51
|
+
uses: actions/checkout@v4
|
|
52
|
+
with:
|
|
53
|
+
ref: ${{ github.ref }}
|
|
54
|
+
fetch-depth: 0
|
|
55
|
+
- name: Set up pixi
|
|
56
|
+
uses: prefix-dev/setup-pixi@v0.8.0
|
|
57
|
+
with:
|
|
58
|
+
environments: lint default
|
|
59
|
+
- name: pre-commit
|
|
60
|
+
run: pixi run pre-commit-run --color=always --show-diff-on-failure
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
env/
|
|
12
|
+
.envrc
|
|
13
|
+
build/
|
|
14
|
+
develop-eggs/
|
|
15
|
+
dist/
|
|
16
|
+
downloads/
|
|
17
|
+
eggs/
|
|
18
|
+
.eggs/
|
|
19
|
+
lib/
|
|
20
|
+
lib64/
|
|
21
|
+
parts/
|
|
22
|
+
sdist/
|
|
23
|
+
var/
|
|
24
|
+
wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
.asv
|
|
29
|
+
pip-wheel-metadata
|
|
30
|
+
|
|
31
|
+
# PyInstaller
|
|
32
|
+
# Usually these files are written by a python script from a template
|
|
33
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
34
|
+
*.manifest
|
|
35
|
+
*.spec
|
|
36
|
+
|
|
37
|
+
# Installer logs
|
|
38
|
+
pip-log.txt
|
|
39
|
+
pip-delete-this-directory.txt
|
|
40
|
+
|
|
41
|
+
# Unit test / coverage reports
|
|
42
|
+
htmlcov/
|
|
43
|
+
.tox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.cache
|
|
47
|
+
nosetests.xml
|
|
48
|
+
coverage.xml
|
|
49
|
+
*.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
/.pytest_cache/
|
|
52
|
+
|
|
53
|
+
# Translations
|
|
54
|
+
*.mo
|
|
55
|
+
*.pot
|
|
56
|
+
|
|
57
|
+
# Django stuff:
|
|
58
|
+
*.log
|
|
59
|
+
local_settings.py
|
|
60
|
+
|
|
61
|
+
# Flask stuff:
|
|
62
|
+
instance/
|
|
63
|
+
.webassets-cache
|
|
64
|
+
|
|
65
|
+
# Scrapy stuff:
|
|
66
|
+
.scrapy
|
|
67
|
+
|
|
68
|
+
# Sphinx documentation
|
|
69
|
+
docs/_build/
|
|
70
|
+
docs/api/
|
|
71
|
+
|
|
72
|
+
# PyBuilder
|
|
73
|
+
target/
|
|
74
|
+
|
|
75
|
+
# Jupyter Notebook
|
|
76
|
+
.ipynb_checkpoints
|
|
77
|
+
|
|
78
|
+
# pyenv
|
|
79
|
+
.python-version
|
|
80
|
+
|
|
81
|
+
# celery beat schedule file
|
|
82
|
+
celerybeat-schedule
|
|
83
|
+
|
|
84
|
+
# SageMath parsed files
|
|
85
|
+
*.sage.py
|
|
86
|
+
|
|
87
|
+
# dotenv
|
|
88
|
+
.env
|
|
89
|
+
|
|
90
|
+
# virtualenv
|
|
91
|
+
.venv
|
|
92
|
+
venv/
|
|
93
|
+
ENV/
|
|
94
|
+
|
|
95
|
+
# Spyder project settings
|
|
96
|
+
.spyderproject
|
|
97
|
+
.spyproject
|
|
98
|
+
|
|
99
|
+
# Rope project settings
|
|
100
|
+
.ropeproject
|
|
101
|
+
|
|
102
|
+
# mkdocs documentation
|
|
103
|
+
/site
|
|
104
|
+
|
|
105
|
+
# mypy
|
|
106
|
+
.mypy_cache/
|
|
107
|
+
|
|
108
|
+
# pycharm
|
|
109
|
+
/.idea/
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# experiments
|
|
113
|
+
private_*
|
|
114
|
+
|
|
115
|
+
# mlflow
|
|
116
|
+
mlruns
|
|
117
|
+
|
|
118
|
+
# vscode
|
|
119
|
+
.vscode
|
|
120
|
+
|
|
121
|
+
# direnv
|
|
122
|
+
.envrc
|
|
123
|
+
# pixi environments
|
|
124
|
+
.pixi
|
|
125
|
+
*.egg-info
|
|
126
|
+
|
|
127
|
+
.ruff_cache
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: local
|
|
3
|
+
hooks:
|
|
4
|
+
# docformatter
|
|
5
|
+
- id: docformatter
|
|
6
|
+
name: docformatter
|
|
7
|
+
entry: pixi run -e lint docformatter -i
|
|
8
|
+
language: system
|
|
9
|
+
types: [python]
|
|
10
|
+
# ruff
|
|
11
|
+
- id: ruff
|
|
12
|
+
name: ruff
|
|
13
|
+
entry: pixi run -e lint ruff check --fix --exit-non-zero-on-fix --force-exclude
|
|
14
|
+
language: system
|
|
15
|
+
types_or: [python, pyi]
|
|
16
|
+
require_serial: true
|
|
17
|
+
- id: ruff-format
|
|
18
|
+
name: ruff-format
|
|
19
|
+
entry: pixi run -e lint ruff format --force-exclude
|
|
20
|
+
language: system
|
|
21
|
+
types_or: [python, pyi]
|
|
22
|
+
require_serial: true
|
|
23
|
+
# prettier
|
|
24
|
+
- id: prettier
|
|
25
|
+
name: prettier
|
|
26
|
+
entry: pixi run -e lint prettier --write --list-different --ignore-unknown
|
|
27
|
+
language: system
|
|
28
|
+
types: [text]
|
|
29
|
+
files: \.(md|yml|yaml)$
|
|
30
|
+
# pre-commit-hooks
|
|
31
|
+
- id: trailing-whitespace-fixer
|
|
32
|
+
name: trailing-whitespace-fixer
|
|
33
|
+
entry: pixi run -e lint trailing-whitespace-fixer
|
|
34
|
+
language: system
|
|
35
|
+
types: [text]
|
|
36
|
+
- id: end-of-file-fixer
|
|
37
|
+
name: end-of-file-fixer
|
|
38
|
+
entry: pixi run -e lint end-of-file-fixer
|
|
39
|
+
language: system
|
|
40
|
+
types: [text]
|
|
41
|
+
- id: check-merge-conflict
|
|
42
|
+
name: check-merge-conflict
|
|
43
|
+
entry: pixi run -e lint check-merge-conflict --assume-in-merge
|
|
44
|
+
language: system
|
|
45
|
+
types: [text]
|
|
46
|
+
# typos
|
|
47
|
+
- id: typos
|
|
48
|
+
name: typos
|
|
49
|
+
entry: pixi run -e lint typos --force-exclude
|
|
50
|
+
language: system
|
|
51
|
+
types: [text]
|
|
52
|
+
require_serial: true
|
|
53
|
+
# mypy
|
|
54
|
+
- id: mypy
|
|
55
|
+
name: mypy
|
|
56
|
+
entry: pixi run -e default mypy
|
|
57
|
+
language: system
|
|
58
|
+
types: [python]
|
|
59
|
+
require_serial: true
|
|
60
|
+
# taplo
|
|
61
|
+
- id: taplo
|
|
62
|
+
name: taplo
|
|
63
|
+
entry: pixi run -e lint taplo format
|
|
64
|
+
language: system
|
|
65
|
+
types: [toml]
|
|
@@ -7,6 +7,15 @@
|
|
|
7
7
|
Changelog
|
|
8
8
|
=========
|
|
9
9
|
|
|
10
|
+
2.0.2 (2024-05-23)
|
|
11
|
+
------------------
|
|
12
|
+
- Included a py.typed file to indicate that the package is fully typed.
|
|
13
|
+
|
|
14
|
+
2.0.1 (2023-06-11)
|
|
15
|
+
------------------
|
|
16
|
+
|
|
17
|
+
- Fix exception when mixing patterns with prematchers and without prematchers.
|
|
18
|
+
|
|
10
19
|
2.0.0 (2023-03-08)
|
|
11
20
|
------------------
|
|
12
21
|
|
|
@@ -1,45 +1,48 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: multiregex
|
|
3
|
-
Version: 2.0.0
|
|
4
|
-
Summary:
|
|
5
|
-
Author-email: "QuantCo, Inc." <noreply@quantco.com>
|
|
6
|
-
Maintainer-email:
|
|
7
|
-
|
|
3
|
+
Version: 2.0.2
|
|
4
|
+
Summary: Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
|
|
5
|
+
Author-email: "QuantCo, Inc." <noreply@quantco.com>, Jonas Haag <jonas@lophus.org>
|
|
6
|
+
Maintainer-email: Bela Stoyan <bela.stoyan@quantco.com>
|
|
7
|
+
Project-URL: Home, https://github.com/quantco/multiregex
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
|
-
Classifier: Programming Language :: Python :: 3.4
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.5
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
13
9
|
Classifier: Programming Language :: Python :: 3.8
|
|
14
10
|
Classifier: Programming Language :: Python :: 3.9
|
|
15
11
|
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
-
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: pyahocorasick
|
|
17
18
|
|
|
18
19
|
# multiregex
|
|
19
20
|
|
|
20
21
|
[](https://github.com/Quantco/multiregex/actions/workflows/ci.yml)
|
|
21
22
|
[](https://docs.dev.quantco.cloud/qc-github-artifacts/Quantco/multiregex/latest/index.html)
|
|
23
|
+
[](https://anaconda.org/conda-forge/multiregex)
|
|
24
|
+
[](https://pypi.org/project/multiregex)
|
|
25
|
+
[](https://pypi.org/project/multiregex)
|
|
22
26
|
|
|
23
27
|
Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
|
|
24
28
|
|
|
29
|
+
## Introduction
|
|
30
|
+
|
|
31
|
+
See [this introductory blog post](https://tech.quantco.com/2022/07/31/multiregex.html).
|
|
32
|
+
|
|
25
33
|
## Installation
|
|
26
34
|
|
|
27
35
|
You can install the package in development mode using:
|
|
28
36
|
|
|
29
37
|
```bash
|
|
30
|
-
git clone
|
|
38
|
+
git clone https://github.com/quantco/multiregex
|
|
31
39
|
cd multiregex
|
|
32
40
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
conda activate multiregex
|
|
37
|
-
|
|
38
|
-
pre-commit install
|
|
39
|
-
pip install --no-build-isolation -e .
|
|
41
|
+
pixi run pre-commit-install
|
|
42
|
+
pixi run postinstall
|
|
43
|
+
pixi run test
|
|
40
44
|
```
|
|
41
45
|
|
|
42
|
-
|
|
43
46
|
## Usage
|
|
44
47
|
|
|
45
48
|
```py
|
|
@@ -68,7 +71,7 @@ To be able to quickly match many regexes against a string, `multiregex` uses
|
|
|
68
71
|
at least one can be assumed to be present in the haystack if the corresponding regex matches.
|
|
69
72
|
As an example, a valid prematcher of `r"\w+\.com"` could be `[".com"]` and a valid
|
|
70
73
|
prematcher of `r"(B|b)aNäNa"` could be `["b"]` or `["anäna"]`.
|
|
71
|
-
Note that prematchers must be all-lowercase (in order for
|
|
74
|
+
Note that prematchers must be all-lowercase (in order for `multiregex` to be able to support `re.IGNORECASE`).
|
|
72
75
|
|
|
73
76
|
You will likely have to provide your own prematchers for all but the simplest
|
|
74
77
|
regex patterns:
|
|
@@ -120,4 +123,3 @@ print(matcher.format_prematcher_false_positives())
|
|
|
120
123
|
|
|
121
124
|
In this example, there were 137 input strings that were matched positive by the prematcher but negative by the regex.
|
|
122
125
|
In other words, the prematcher failed to prevent slow regex evaluation in 72% of the cases.
|
|
123
|
-
|
|
@@ -2,27 +2,29 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://github.com/Quantco/multiregex/actions/workflows/ci.yml)
|
|
4
4
|
[](https://docs.dev.quantco.cloud/qc-github-artifacts/Quantco/multiregex/latest/index.html)
|
|
5
|
+
[](https://anaconda.org/conda-forge/multiregex)
|
|
6
|
+
[](https://pypi.org/project/multiregex)
|
|
7
|
+
[](https://pypi.org/project/multiregex)
|
|
5
8
|
|
|
6
9
|
Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
|
|
7
10
|
|
|
11
|
+
## Introduction
|
|
12
|
+
|
|
13
|
+
See [this introductory blog post](https://tech.quantco.com/2022/07/31/multiregex.html).
|
|
14
|
+
|
|
8
15
|
## Installation
|
|
9
16
|
|
|
10
17
|
You can install the package in development mode using:
|
|
11
18
|
|
|
12
19
|
```bash
|
|
13
|
-
git clone
|
|
20
|
+
git clone https://github.com/quantco/multiregex
|
|
14
21
|
cd multiregex
|
|
15
22
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
conda activate multiregex
|
|
20
|
-
|
|
21
|
-
pre-commit install
|
|
22
|
-
pip install --no-build-isolation -e .
|
|
23
|
+
pixi run pre-commit-install
|
|
24
|
+
pixi run postinstall
|
|
25
|
+
pixi run test
|
|
23
26
|
```
|
|
24
27
|
|
|
25
|
-
|
|
26
28
|
## Usage
|
|
27
29
|
|
|
28
30
|
```py
|
|
@@ -51,7 +53,7 @@ To be able to quickly match many regexes against a string, `multiregex` uses
|
|
|
51
53
|
at least one can be assumed to be present in the haystack if the corresponding regex matches.
|
|
52
54
|
As an example, a valid prematcher of `r"\w+\.com"` could be `[".com"]` and a valid
|
|
53
55
|
prematcher of `r"(B|b)aNäNa"` could be `["b"]` or `["anäna"]`.
|
|
54
|
-
Note that prematchers must be all-lowercase (in order for
|
|
56
|
+
Note that prematchers must be all-lowercase (in order for `multiregex` to be able to support `re.IGNORECASE`).
|
|
55
57
|
|
|
56
58
|
You will likely have to provide your own prematchers for all but the simplest
|
|
57
59
|
regex patterns:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
r"""Speed up regex matching with non-regex substring "prematchers", similar to
|
|
1
|
+
r"""Speed up regex matching with non-regex substring "prematchers", similar to
|
|
2
|
+
Bloom filters.
|
|
2
3
|
|
|
3
4
|
For each regex pattern we use a list of simple (non-regex) substring prematchers.
|
|
4
5
|
When evaluating regex patterns on a string, we use the prematchers to restrict
|
|
@@ -19,7 +20,9 @@ automatically generated prematchers.
|
|
|
19
20
|
|
|
20
21
|
import collections
|
|
21
22
|
import functools
|
|
23
|
+
import importlib
|
|
22
24
|
import re
|
|
25
|
+
import warnings
|
|
23
26
|
|
|
24
27
|
try:
|
|
25
28
|
sre_constants = re._constants # type: ignore
|
|
@@ -42,7 +45,12 @@ from typing import (
|
|
|
42
45
|
|
|
43
46
|
import ahocorasick
|
|
44
47
|
|
|
45
|
-
|
|
48
|
+
try:
|
|
49
|
+
__version__ = importlib.metadata.version(__name__)
|
|
50
|
+
except importlib.metadata.PackageNotFoundError as e:
|
|
51
|
+
warnings.warn(f"Could not determine version of {__name__}", stacklevel=1)
|
|
52
|
+
warnings.warn(str(e), stacklevel=1)
|
|
53
|
+
__version__ = "unknown"
|
|
46
54
|
|
|
47
55
|
|
|
48
56
|
V = TypeVar("V")
|
|
@@ -83,10 +91,13 @@ class RegexMatcher:
|
|
|
83
91
|
patterns = self._generate_missing_prematchers(patterns)
|
|
84
92
|
self.patterns = [pattern for pattern, _ in patterns]
|
|
85
93
|
self.prematchers = dict(patterns)
|
|
94
|
+
enumerated_patterns = list(enumerate(patterns))
|
|
86
95
|
self.patterns_without_prematchers = {
|
|
87
|
-
|
|
96
|
+
(idx, pattern)
|
|
97
|
+
for idx, (pattern, prematchers) in enumerated_patterns
|
|
98
|
+
if not prematchers
|
|
88
99
|
}
|
|
89
|
-
self.automaton = self._make_automaton(
|
|
100
|
+
self.automaton = self._make_automaton(enumerated_patterns)
|
|
90
101
|
|
|
91
102
|
self.count_prematcher_false_positives = count_prematcher_false_positives
|
|
92
103
|
if count_prematcher_false_positives:
|
|
@@ -107,9 +118,7 @@ class RegexMatcher:
|
|
|
107
118
|
def safe_set(iterable):
|
|
108
119
|
if isinstance(iterable, str):
|
|
109
120
|
raise TypeError(
|
|
110
|
-
"Refusing to interpret {!r} as a list of patterns, pass a list of strings instead"
|
|
111
|
-
iterable
|
|
112
|
-
)
|
|
121
|
+
f"Refusing to interpret {iterable!r} as a list of patterns, pass a list of strings instead"
|
|
113
122
|
)
|
|
114
123
|
else:
|
|
115
124
|
return set(iterable)
|
|
@@ -130,9 +139,11 @@ class RegexMatcher:
|
|
|
130
139
|
patterns = [
|
|
131
140
|
(
|
|
132
141
|
pattern,
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
142
|
+
(
|
|
143
|
+
self.generate_prematchers(pattern)
|
|
144
|
+
if prematchers is None
|
|
145
|
+
else prematchers
|
|
146
|
+
),
|
|
136
147
|
)
|
|
137
148
|
for pattern, prematchers in patterns
|
|
138
149
|
]
|
|
@@ -142,10 +153,10 @@ class RegexMatcher:
|
|
|
142
153
|
return patterns
|
|
143
154
|
|
|
144
155
|
@staticmethod
|
|
145
|
-
def _make_automaton(
|
|
156
|
+
def _make_automaton(enumerated_patterns):
|
|
146
157
|
"""Create the pyahocorasick automaton."""
|
|
147
158
|
pattern_candidates_by_prematchers = collections.defaultdict(set)
|
|
148
|
-
for pattern_idx, (pattern, prematchers) in
|
|
159
|
+
for pattern_idx, (pattern, prematchers) in enumerated_patterns:
|
|
149
160
|
for prematcher in prematchers:
|
|
150
161
|
# `pattern_idx` is used for keeping patterns in order, see `get_pattern_candidates`.
|
|
151
162
|
pattern_candidates_by_prematchers[prematcher].add(
|
|
@@ -190,10 +201,8 @@ class RegexMatcher:
|
|
|
190
201
|
|
|
191
202
|
"""Alias for ``run(re.search, ...)``."""
|
|
192
203
|
search = functools.partialmethod(run, re.search)
|
|
193
|
-
|
|
194
204
|
"""Alias for ``run(re.match, ...)``."""
|
|
195
205
|
match = functools.partialmethod(run, re.match)
|
|
196
|
-
|
|
197
206
|
"""Alias for ``run(re.fullmatch, ...)``."""
|
|
198
207
|
fullmatch = functools.partialmethod(run, re.fullmatch)
|
|
199
208
|
|
|
@@ -248,18 +257,17 @@ class RegexMatcher:
|
|
|
248
257
|
def validate_prematcher(prematcher: str) -> None:
|
|
249
258
|
if not prematcher or any(map(str.isupper, prematcher)):
|
|
250
259
|
raise ValueError(
|
|
251
|
-
"Prematcher {!r} must be non-empty, all-lowercase, all-ASCII"
|
|
252
|
-
prematcher
|
|
253
|
-
)
|
|
260
|
+
f"Prematcher {prematcher!r} must be non-empty, all-lowercase, all-ASCII"
|
|
254
261
|
)
|
|
255
262
|
|
|
256
263
|
|
|
257
264
|
def generate_prematchers(pattern: Pattern) -> Prematchers:
|
|
258
265
|
"""Generate fallback/default prematchers for the given regex `pattern`.
|
|
259
266
|
|
|
260
|
-
Currently the fallback prematcher is just the set of longest
|
|
261
|
-
in the pattern, eg. "Fast(er)? regex(es| matching)"
|
|
262
|
-
|
|
267
|
+
Currently the fallback prematcher is just the set of longest
|
|
268
|
+
terminal texts in the pattern, eg. "Fast(er)? regex(es| matching)"
|
|
269
|
+
-> {" regex"}. One level of branches with the "|" character is
|
|
270
|
+
supported, ie. "(a|bb|ccc)" -> {"ccc", "a", "bb"}.
|
|
263
271
|
"""
|
|
264
272
|
|
|
265
273
|
def _get_top_level_prematcher(sre_ast):
|
|
@@ -283,7 +291,7 @@ def generate_prematchers(pattern: Pattern) -> Prematchers:
|
|
|
283
291
|
if all(child_prematchers):
|
|
284
292
|
return child_prematchers
|
|
285
293
|
|
|
286
|
-
raise ValueError("Could not generate prematchers for {!r}"
|
|
294
|
+
raise ValueError(f"Could not generate prematchers for {pattern.pattern!r}")
|
|
287
295
|
|
|
288
296
|
|
|
289
297
|
def _simplify_sre_ast(sre_ast):
|
|
@@ -315,7 +323,8 @@ def _sre_find_terminals(sre_ast):
|
|
|
315
323
|
|
|
316
324
|
|
|
317
325
|
def _ahocorasick_make_automaton(words: Dict[str, V]) -> "ahocorasick.Automaton[V]":
|
|
318
|
-
"""Make an ahocorasick automaton from a dictionary of `needle -> value`
|
|
326
|
+
"""Make an ahocorasick automaton from a dictionary of `needle -> value`
|
|
327
|
+
items."""
|
|
319
328
|
automaton = ahocorasick.Automaton() # type: ahocorasick.Automaton[V]
|
|
320
329
|
for word, value in words.items():
|
|
321
330
|
_ahocorasick_ensure_successful(automaton.add_word(word, value))
|
|
@@ -324,6 +333,6 @@ def _ahocorasick_make_automaton(words: Dict[str, V]) -> "ahocorasick.Automaton[V
|
|
|
324
333
|
|
|
325
334
|
|
|
326
335
|
def _ahocorasick_ensure_successful(res):
|
|
327
|
-
"""
|
|
336
|
+
"""Pyahocorasick returns errors as bools."""
|
|
328
337
|
if res is False:
|
|
329
338
|
raise AhocorasickError("Error performing ahocorasick call")
|
|
File without changes
|