multiregex 2.0.1__tar.gz → 2.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {multiregex-2.0.1 → multiregex-2.0.2}/.gitattributes +2 -0
- multiregex-2.0.2/.github/CODEOWNERS +2 -0
- multiregex-2.0.2/.github/ISSUE_TEMPLATE/config.yml +1 -0
- multiregex-2.0.2/.github/ISSUE_TEMPLATE/issue-template.md +6 -0
- multiregex-2.0.1/.github/PULL_REQUEST_TEMPLATE.md → multiregex-2.0.2/.github/pull_request_template.md +3 -1
- multiregex-2.0.2/.github/workflows/build.yml +44 -0
- multiregex-2.0.2/.github/workflows/ci.yml +60 -0
- multiregex-2.0.2/.gitignore +127 -0
- multiregex-2.0.2/.pre-commit-config.yaml +65 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/CHANGELOG.rst +5 -1
- {multiregex-2.0.1 → multiregex-2.0.2}/PKG-INFO +19 -19
- {multiregex-2.0.1 → multiregex-2.0.2}/README.md +9 -10
- {multiregex-2.0.1 → multiregex-2.0.2}/docs/make.bat +1 -1
- {multiregex-2.0.1/src → multiregex-2.0.2}/multiregex/__init__.py +25 -19
- multiregex-2.0.2/multiregex/py.typed +0 -0
- multiregex-2.0.2/multiregex.egg-info/PKG-INFO +125 -0
- multiregex-2.0.2/multiregex.egg-info/SOURCES.txt +35 -0
- multiregex-2.0.2/multiregex.egg-info/dependency_links.txt +1 -0
- multiregex-2.0.2/multiregex.egg-info/requires.txt +1 -0
- multiregex-2.0.2/multiregex.egg-info/top_level.txt +1 -0
- multiregex-2.0.2/pixi.lock +4161 -0
- multiregex-2.0.2/pixi.toml +58 -0
- multiregex-2.0.2/pyproject.toml +82 -0
- multiregex-2.0.2/setup.cfg +4 -0
- {multiregex-2.0.1/src → multiregex-2.0.2/stubs}/ahocorasick.pyi +2 -3
- {multiregex-2.0.1 → multiregex-2.0.2}/tests/test_bench.py +1 -1
- {multiregex-2.0.1 → multiregex-2.0.2}/tests/test_cpython_tests.py +1 -1
- multiregex-2.0.1/.flake8 +0 -11
- multiregex-2.0.1/.github/CODEOWNERS +0 -1
- multiregex-2.0.1/.github/workflows/ci.yml +0 -54
- multiregex-2.0.1/.pre-commit-config.yaml +0 -28
- multiregex-2.0.1/environment.yml +0 -22
- multiregex-2.0.1/pyproject.toml +0 -59
- multiregex-2.0.1/setup.cfg +0 -30
- multiregex-2.0.1/setup.py +0 -3
- {multiregex-2.0.1 → multiregex-2.0.2}/.github/dependabot.yml +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/LICENSE +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/docs/Makefile +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/docs/changelog.rst +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/docs/conf.py +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/docs/index.rst +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/test_utils/__init__.py +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/test_utils/cpython_test_re.py +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/tests/conftest.py +0 -0
- {multiregex-2.0.1 → multiregex-2.0.2}/tests/test_multiregex.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
blank_issues_enabled: false
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
<!-- ⚠️ This is an open-source repository. Do not share sensitive information. -->
|
|
2
|
+
|
|
1
3
|
<!--
|
|
2
4
|
Thank you for pull request.
|
|
3
5
|
Below are a few things we ask you kindly to self-check before getting a review. Remove checks that are not relevant.
|
|
@@ -5,4 +7,4 @@ Below are a few things we ask you kindly to self-check before getting a review.
|
|
|
5
7
|
|
|
6
8
|
# Checklist
|
|
7
9
|
|
|
8
|
-
|
|
10
|
+
- [ ] Added a `CHANGELOG.rst` entry
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: Build
|
|
2
|
+
on:
|
|
3
|
+
pull_request:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- "main"
|
|
7
|
+
tags:
|
|
8
|
+
- "*"
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
with:
|
|
16
|
+
ref: ${{ github.ref }}
|
|
17
|
+
fetch-depth: 0
|
|
18
|
+
- name: Set up pixi
|
|
19
|
+
uses: prefix-dev/setup-pixi@v0.8.0
|
|
20
|
+
with:
|
|
21
|
+
environments: build
|
|
22
|
+
- name: Build project
|
|
23
|
+
run: pixi run -e build build-wheel
|
|
24
|
+
- name: Upload package
|
|
25
|
+
uses: actions/upload-artifact@v4
|
|
26
|
+
with:
|
|
27
|
+
name: artifact
|
|
28
|
+
path: dist/*
|
|
29
|
+
|
|
30
|
+
release:
|
|
31
|
+
name: Publish package
|
|
32
|
+
if: startsWith(github.ref, 'refs/tags/')
|
|
33
|
+
needs: [build]
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
permissions:
|
|
36
|
+
id-token: write
|
|
37
|
+
environment: pypi
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/download-artifact@v4
|
|
40
|
+
with:
|
|
41
|
+
name: artifact
|
|
42
|
+
path: dist
|
|
43
|
+
- name: Publish package on PyPi
|
|
44
|
+
uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches:
|
|
5
|
+
- main
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
defaults:
|
|
9
|
+
run:
|
|
10
|
+
shell: bash -el {0}
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
unit-tests:
|
|
14
|
+
name: pytest
|
|
15
|
+
timeout-minutes: 10
|
|
16
|
+
runs-on: ${{ matrix.os }}
|
|
17
|
+
strategy:
|
|
18
|
+
fail-fast: false
|
|
19
|
+
matrix:
|
|
20
|
+
include:
|
|
21
|
+
- { os: ubuntu-latest, environment: py38 }
|
|
22
|
+
- { os: ubuntu-latest, environment: py312 }
|
|
23
|
+
- { os: windows-latest, environment: py38 }
|
|
24
|
+
- { os: windows-latest, environment: py312 }
|
|
25
|
+
- { os: macos-latest, environment: py38 }
|
|
26
|
+
- { os: macos-latest, environment: py312 }
|
|
27
|
+
steps:
|
|
28
|
+
- name: Checkout branch
|
|
29
|
+
uses: actions/checkout@v4
|
|
30
|
+
with:
|
|
31
|
+
ref: ${{ github.ref }}
|
|
32
|
+
fetch-depth: 0
|
|
33
|
+
- name: Set up pixi
|
|
34
|
+
uses: prefix-dev/setup-pixi@v0.8.0
|
|
35
|
+
with:
|
|
36
|
+
environments: ${{ matrix.environment }}
|
|
37
|
+
- name: Install repository
|
|
38
|
+
run: pixi run -e ${{ matrix.environment }} postinstall
|
|
39
|
+
- name: Run unittests
|
|
40
|
+
uses: quantco/pytest-action@v2
|
|
41
|
+
with:
|
|
42
|
+
report-title: Unit tests ${{ matrix.environment }}
|
|
43
|
+
custom-pytest: pixi run -e ${{ matrix.environment }} pytest
|
|
44
|
+
|
|
45
|
+
pre-commit-checks:
|
|
46
|
+
name: Pre-commit Checks
|
|
47
|
+
timeout-minutes: 30
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
steps:
|
|
50
|
+
- name: Checkout branch
|
|
51
|
+
uses: actions/checkout@v4
|
|
52
|
+
with:
|
|
53
|
+
ref: ${{ github.ref }}
|
|
54
|
+
fetch-depth: 0
|
|
55
|
+
- name: Set up pixi
|
|
56
|
+
uses: prefix-dev/setup-pixi@v0.8.0
|
|
57
|
+
with:
|
|
58
|
+
environments: lint default
|
|
59
|
+
- name: pre-commit
|
|
60
|
+
run: pixi run pre-commit-run --color=always --show-diff-on-failure
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
env/
|
|
12
|
+
.envrc
|
|
13
|
+
build/
|
|
14
|
+
develop-eggs/
|
|
15
|
+
dist/
|
|
16
|
+
downloads/
|
|
17
|
+
eggs/
|
|
18
|
+
.eggs/
|
|
19
|
+
lib/
|
|
20
|
+
lib64/
|
|
21
|
+
parts/
|
|
22
|
+
sdist/
|
|
23
|
+
var/
|
|
24
|
+
wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
.asv
|
|
29
|
+
pip-wheel-metadata
|
|
30
|
+
|
|
31
|
+
# PyInstaller
|
|
32
|
+
# Usually these files are written by a python script from a template
|
|
33
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
34
|
+
*.manifest
|
|
35
|
+
*.spec
|
|
36
|
+
|
|
37
|
+
# Installer logs
|
|
38
|
+
pip-log.txt
|
|
39
|
+
pip-delete-this-directory.txt
|
|
40
|
+
|
|
41
|
+
# Unit test / coverage reports
|
|
42
|
+
htmlcov/
|
|
43
|
+
.tox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.cache
|
|
47
|
+
nosetests.xml
|
|
48
|
+
coverage.xml
|
|
49
|
+
*.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
/.pytest_cache/
|
|
52
|
+
|
|
53
|
+
# Translations
|
|
54
|
+
*.mo
|
|
55
|
+
*.pot
|
|
56
|
+
|
|
57
|
+
# Django stuff:
|
|
58
|
+
*.log
|
|
59
|
+
local_settings.py
|
|
60
|
+
|
|
61
|
+
# Flask stuff:
|
|
62
|
+
instance/
|
|
63
|
+
.webassets-cache
|
|
64
|
+
|
|
65
|
+
# Scrapy stuff:
|
|
66
|
+
.scrapy
|
|
67
|
+
|
|
68
|
+
# Sphinx documentation
|
|
69
|
+
docs/_build/
|
|
70
|
+
docs/api/
|
|
71
|
+
|
|
72
|
+
# PyBuilder
|
|
73
|
+
target/
|
|
74
|
+
|
|
75
|
+
# Jupyter Notebook
|
|
76
|
+
.ipynb_checkpoints
|
|
77
|
+
|
|
78
|
+
# pyenv
|
|
79
|
+
.python-version
|
|
80
|
+
|
|
81
|
+
# celery beat schedule file
|
|
82
|
+
celerybeat-schedule
|
|
83
|
+
|
|
84
|
+
# SageMath parsed files
|
|
85
|
+
*.sage.py
|
|
86
|
+
|
|
87
|
+
# dotenv
|
|
88
|
+
.env
|
|
89
|
+
|
|
90
|
+
# virtualenv
|
|
91
|
+
.venv
|
|
92
|
+
venv/
|
|
93
|
+
ENV/
|
|
94
|
+
|
|
95
|
+
# Spyder project settings
|
|
96
|
+
.spyderproject
|
|
97
|
+
.spyproject
|
|
98
|
+
|
|
99
|
+
# Rope project settings
|
|
100
|
+
.ropeproject
|
|
101
|
+
|
|
102
|
+
# mkdocs documentation
|
|
103
|
+
/site
|
|
104
|
+
|
|
105
|
+
# mypy
|
|
106
|
+
.mypy_cache/
|
|
107
|
+
|
|
108
|
+
# pycharm
|
|
109
|
+
/.idea/
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# experiments
|
|
113
|
+
private_*
|
|
114
|
+
|
|
115
|
+
# mlflow
|
|
116
|
+
mlruns
|
|
117
|
+
|
|
118
|
+
# vscode
|
|
119
|
+
.vscode
|
|
120
|
+
|
|
121
|
+
# direnv
|
|
122
|
+
.envrc
|
|
123
|
+
# pixi environments
|
|
124
|
+
.pixi
|
|
125
|
+
*.egg-info
|
|
126
|
+
|
|
127
|
+
.ruff_cache
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: local
|
|
3
|
+
hooks:
|
|
4
|
+
# docformatter
|
|
5
|
+
- id: docformatter
|
|
6
|
+
name: docformatter
|
|
7
|
+
entry: pixi run -e lint docformatter -i
|
|
8
|
+
language: system
|
|
9
|
+
types: [python]
|
|
10
|
+
# ruff
|
|
11
|
+
- id: ruff
|
|
12
|
+
name: ruff
|
|
13
|
+
entry: pixi run -e lint ruff check --fix --exit-non-zero-on-fix --force-exclude
|
|
14
|
+
language: system
|
|
15
|
+
types_or: [python, pyi]
|
|
16
|
+
require_serial: true
|
|
17
|
+
- id: ruff-format
|
|
18
|
+
name: ruff-format
|
|
19
|
+
entry: pixi run -e lint ruff format --force-exclude
|
|
20
|
+
language: system
|
|
21
|
+
types_or: [python, pyi]
|
|
22
|
+
require_serial: true
|
|
23
|
+
# prettier
|
|
24
|
+
- id: prettier
|
|
25
|
+
name: prettier
|
|
26
|
+
entry: pixi run -e lint prettier --write --list-different --ignore-unknown
|
|
27
|
+
language: system
|
|
28
|
+
types: [text]
|
|
29
|
+
files: \.(md|yml|yaml)$
|
|
30
|
+
# pre-commit-hooks
|
|
31
|
+
- id: trailing-whitespace-fixer
|
|
32
|
+
name: trailing-whitespace-fixer
|
|
33
|
+
entry: pixi run -e lint trailing-whitespace-fixer
|
|
34
|
+
language: system
|
|
35
|
+
types: [text]
|
|
36
|
+
- id: end-of-file-fixer
|
|
37
|
+
name: end-of-file-fixer
|
|
38
|
+
entry: pixi run -e lint end-of-file-fixer
|
|
39
|
+
language: system
|
|
40
|
+
types: [text]
|
|
41
|
+
- id: check-merge-conflict
|
|
42
|
+
name: check-merge-conflict
|
|
43
|
+
entry: pixi run -e lint check-merge-conflict --assume-in-merge
|
|
44
|
+
language: system
|
|
45
|
+
types: [text]
|
|
46
|
+
# typos
|
|
47
|
+
- id: typos
|
|
48
|
+
name: typos
|
|
49
|
+
entry: pixi run -e lint typos --force-exclude
|
|
50
|
+
language: system
|
|
51
|
+
types: [text]
|
|
52
|
+
require_serial: true
|
|
53
|
+
# mypy
|
|
54
|
+
- id: mypy
|
|
55
|
+
name: mypy
|
|
56
|
+
entry: pixi run -e default mypy
|
|
57
|
+
language: system
|
|
58
|
+
types: [python]
|
|
59
|
+
require_serial: true
|
|
60
|
+
# taplo
|
|
61
|
+
- id: taplo
|
|
62
|
+
name: taplo
|
|
63
|
+
entry: pixi run -e lint taplo format
|
|
64
|
+
language: system
|
|
65
|
+
types: [toml]
|
|
@@ -7,7 +7,11 @@
|
|
|
7
7
|
Changelog
|
|
8
8
|
=========
|
|
9
9
|
|
|
10
|
-
2.0.
|
|
10
|
+
2.0.2 (2024-05-23)
|
|
11
|
+
------------------
|
|
12
|
+
- Included a py.typed file to indicate that the package is fully typed.
|
|
13
|
+
|
|
14
|
+
2.0.1 (2023-06-11)
|
|
11
15
|
------------------
|
|
12
16
|
|
|
13
17
|
- Fix exception when mixing patterns with prematchers and without prematchers.
|
|
@@ -1,18 +1,20 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: multiregex
|
|
3
|
-
Version: 2.0.1
|
|
4
|
-
Summary:
|
|
5
|
-
Author-email: "QuantCo, Inc." <noreply@quantco.com>
|
|
6
|
-
Maintainer-email:
|
|
7
|
-
|
|
3
|
+
Version: 2.0.2
|
|
4
|
+
Summary: Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
|
|
5
|
+
Author-email: "QuantCo, Inc." <noreply@quantco.com>, Jonas Haag <jonas@lophus.org>
|
|
6
|
+
Maintainer-email: Bela Stoyan <bela.stoyan@quantco.com>
|
|
7
|
+
Project-URL: Home, https://github.com/quantco/multiregex
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
11
9
|
Classifier: Programming Language :: Python :: 3.8
|
|
12
10
|
Classifier: Programming Language :: Python :: 3.9
|
|
13
11
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
-
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: pyahocorasick
|
|
16
18
|
|
|
17
19
|
# multiregex
|
|
18
20
|
|
|
@@ -24,24 +26,23 @@ Project-URL: Home, https://github.com/quantco/multiregex
|
|
|
24
26
|
|
|
25
27
|
Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
|
|
26
28
|
|
|
29
|
+
## Introduction
|
|
30
|
+
|
|
31
|
+
See [this introductory blog post](https://tech.quantco.com/2022/07/31/multiregex.html).
|
|
32
|
+
|
|
27
33
|
## Installation
|
|
28
34
|
|
|
29
35
|
You can install the package in development mode using:
|
|
30
36
|
|
|
31
37
|
```bash
|
|
32
|
-
git clone
|
|
38
|
+
git clone https://github.com/quantco/multiregex
|
|
33
39
|
cd multiregex
|
|
34
40
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
conda activate multiregex
|
|
39
|
-
|
|
40
|
-
pre-commit install
|
|
41
|
-
pip install --no-build-isolation -e .
|
|
41
|
+
pixi run pre-commit-install
|
|
42
|
+
pixi run postinstall
|
|
43
|
+
pixi run test
|
|
42
44
|
```
|
|
43
45
|
|
|
44
|
-
|
|
45
46
|
## Usage
|
|
46
47
|
|
|
47
48
|
```py
|
|
@@ -70,7 +71,7 @@ To be able to quickly match many regexes against a string, `multiregex` uses
|
|
|
70
71
|
at least one can be assumed to be present in the haystack if the corresponding regex matches.
|
|
71
72
|
As an example, a valid prematcher of `r"\w+\.com"` could be `[".com"]` and a valid
|
|
72
73
|
prematcher of `r"(B|b)aNäNa"` could be `["b"]` or `["anäna"]`.
|
|
73
|
-
Note that prematchers must be all-lowercase (in order for
|
|
74
|
+
Note that prematchers must be all-lowercase (in order for `multiregex` to be able to support `re.IGNORECASE`).
|
|
74
75
|
|
|
75
76
|
You will likely have to provide your own prematchers for all but the simplest
|
|
76
77
|
regex patterns:
|
|
@@ -122,4 +123,3 @@ print(matcher.format_prematcher_false_positives())
|
|
|
122
123
|
|
|
123
124
|
In this example, there were 137 input strings that were matched positive by the prematcher but negative by the regex.
|
|
124
125
|
In other words, the prematcher failed to prevent slow regex evaluation in 72% of the cases.
|
|
125
|
-
|
|
@@ -8,24 +8,23 @@
|
|
|
8
8
|
|
|
9
9
|
Quickly match many regexes against a string. Provides 2-10x speedups over naïve regex matching.
|
|
10
10
|
|
|
11
|
+
## Introduction
|
|
12
|
+
|
|
13
|
+
See [this introductory blog post](https://tech.quantco.com/2022/07/31/multiregex.html).
|
|
14
|
+
|
|
11
15
|
## Installation
|
|
12
16
|
|
|
13
17
|
You can install the package in development mode using:
|
|
14
18
|
|
|
15
19
|
```bash
|
|
16
|
-
git clone
|
|
20
|
+
git clone https://github.com/quantco/multiregex
|
|
17
21
|
cd multiregex
|
|
18
22
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
conda activate multiregex
|
|
23
|
-
|
|
24
|
-
pre-commit install
|
|
25
|
-
pip install --no-build-isolation -e .
|
|
23
|
+
pixi run pre-commit-install
|
|
24
|
+
pixi run postinstall
|
|
25
|
+
pixi run test
|
|
26
26
|
```
|
|
27
27
|
|
|
28
|
-
|
|
29
28
|
## Usage
|
|
30
29
|
|
|
31
30
|
```py
|
|
@@ -54,7 +53,7 @@ To be able to quickly match many regexes against a string, `multiregex` uses
|
|
|
54
53
|
at least one can be assumed to be present in the haystack if the corresponding regex matches.
|
|
55
54
|
As an example, a valid prematcher of `r"\w+\.com"` could be `[".com"]` and a valid
|
|
56
55
|
prematcher of `r"(B|b)aNäNa"` could be `["b"]` or `["anäna"]`.
|
|
57
|
-
Note that prematchers must be all-lowercase (in order for
|
|
56
|
+
Note that prematchers must be all-lowercase (in order for `multiregex` to be able to support `re.IGNORECASE`).
|
|
58
57
|
|
|
59
58
|
You will likely have to provide your own prematchers for all but the simplest
|
|
60
59
|
regex patterns:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
r"""Speed up regex matching with non-regex substring "prematchers", similar to Bloom filters.
|
|
1
|
+
r"""Speed up regex matching with non-regex substring "prematchers", similar to
|
|
2
|
+
Bloom filters.
|
|
2
3
|
|
|
3
4
|
For each regex pattern we use a list of simple (non-regex) substring prematchers.
|
|
4
5
|
When evaluating regex patterns on a string, we use the prematchers to restrict
|
|
@@ -19,7 +20,9 @@ automatically generated prematchers.
|
|
|
19
20
|
|
|
20
21
|
import collections
|
|
21
22
|
import functools
|
|
23
|
+
import importlib
|
|
22
24
|
import re
|
|
25
|
+
import warnings
|
|
23
26
|
|
|
24
27
|
try:
|
|
25
28
|
sre_constants = re._constants # type: ignore
|
|
@@ -42,7 +45,12 @@ from typing import (
|
|
|
42
45
|
|
|
43
46
|
import ahocorasick
|
|
44
47
|
|
|
45
|
-
|
|
48
|
+
try:
|
|
49
|
+
__version__ = importlib.metadata.version(__name__)
|
|
50
|
+
except importlib.metadata.PackageNotFoundError as e:
|
|
51
|
+
warnings.warn(f"Could not determine version of {__name__}", stacklevel=1)
|
|
52
|
+
warnings.warn(str(e), stacklevel=1)
|
|
53
|
+
__version__ = "unknown"
|
|
46
54
|
|
|
47
55
|
|
|
48
56
|
V = TypeVar("V")
|
|
@@ -110,9 +118,7 @@ class RegexMatcher:
|
|
|
110
118
|
def safe_set(iterable):
|
|
111
119
|
if isinstance(iterable, str):
|
|
112
120
|
raise TypeError(
|
|
113
|
-
"Refusing to interpret {!r} as a list of patterns, pass a list of strings instead"
|
|
114
|
-
iterable
|
|
115
|
-
)
|
|
121
|
+
f"Refusing to interpret {iterable!r} as a list of patterns, pass a list of strings instead"
|
|
116
122
|
)
|
|
117
123
|
else:
|
|
118
124
|
return set(iterable)
|
|
@@ -133,9 +139,11 @@ class RegexMatcher:
|
|
|
133
139
|
patterns = [
|
|
134
140
|
(
|
|
135
141
|
pattern,
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
142
|
+
(
|
|
143
|
+
self.generate_prematchers(pattern)
|
|
144
|
+
if prematchers is None
|
|
145
|
+
else prematchers
|
|
146
|
+
),
|
|
139
147
|
)
|
|
140
148
|
for pattern, prematchers in patterns
|
|
141
149
|
]
|
|
@@ -193,10 +201,8 @@ class RegexMatcher:
|
|
|
193
201
|
|
|
194
202
|
"""Alias for ``run(re.search, ...)``."""
|
|
195
203
|
search = functools.partialmethod(run, re.search)
|
|
196
|
-
|
|
197
204
|
"""Alias for ``run(re.match, ...)``."""
|
|
198
205
|
match = functools.partialmethod(run, re.match)
|
|
199
|
-
|
|
200
206
|
"""Alias for ``run(re.fullmatch, ...)``."""
|
|
201
207
|
fullmatch = functools.partialmethod(run, re.fullmatch)
|
|
202
208
|
|
|
@@ -251,18 +257,17 @@ class RegexMatcher:
|
|
|
251
257
|
def validate_prematcher(prematcher: str) -> None:
|
|
252
258
|
if not prematcher or any(map(str.isupper, prematcher)):
|
|
253
259
|
raise ValueError(
|
|
254
|
-
"Prematcher {!r} must be non-empty, all-lowercase, all-ASCII"
|
|
255
|
-
prematcher
|
|
256
|
-
)
|
|
260
|
+
f"Prematcher {prematcher!r} must be non-empty, all-lowercase, all-ASCII"
|
|
257
261
|
)
|
|
258
262
|
|
|
259
263
|
|
|
260
264
|
def generate_prematchers(pattern: Pattern) -> Prematchers:
|
|
261
265
|
"""Generate fallback/default prematchers for the given regex `pattern`.
|
|
262
266
|
|
|
263
|
-
Currently the fallback prematcher is just the set of longest
|
|
264
|
-
in the pattern, eg. "Fast(er)? regex(es| matching)"
|
|
265
|
-
|
|
267
|
+
Currently the fallback prematcher is just the set of longest
|
|
268
|
+
terminal texts in the pattern, eg. "Fast(er)? regex(es| matching)"
|
|
269
|
+
-> {" regex"}. One level of branches with the "|" character is
|
|
270
|
+
supported, ie. "(a|bb|ccc)" -> {"ccc", "a", "bb"}.
|
|
266
271
|
"""
|
|
267
272
|
|
|
268
273
|
def _get_top_level_prematcher(sre_ast):
|
|
@@ -286,7 +291,7 @@ def generate_prematchers(pattern: Pattern) -> Prematchers:
|
|
|
286
291
|
if all(child_prematchers):
|
|
287
292
|
return child_prematchers
|
|
288
293
|
|
|
289
|
-
raise ValueError("Could not generate prematchers for {!r}"
|
|
294
|
+
raise ValueError(f"Could not generate prematchers for {pattern.pattern!r}")
|
|
290
295
|
|
|
291
296
|
|
|
292
297
|
def _simplify_sre_ast(sre_ast):
|
|
@@ -318,7 +323,8 @@ def _sre_find_terminals(sre_ast):
|
|
|
318
323
|
|
|
319
324
|
|
|
320
325
|
def _ahocorasick_make_automaton(words: Dict[str, V]) -> "ahocorasick.Automaton[V]":
|
|
321
|
-
"""Make an ahocorasick automaton from a dictionary of `needle -> value`
|
|
326
|
+
"""Make an ahocorasick automaton from a dictionary of `needle -> value`
|
|
327
|
+
items."""
|
|
322
328
|
automaton = ahocorasick.Automaton() # type: ahocorasick.Automaton[V]
|
|
323
329
|
for word, value in words.items():
|
|
324
330
|
_ahocorasick_ensure_successful(automaton.add_word(word, value))
|
|
@@ -327,6 +333,6 @@ def _ahocorasick_make_automaton(words: Dict[str, V]) -> "ahocorasick.Automaton[V
|
|
|
327
333
|
|
|
328
334
|
|
|
329
335
|
def _ahocorasick_ensure_successful(res):
|
|
330
|
-
"""
|
|
336
|
+
"""Pyahocorasick returns errors as bools."""
|
|
331
337
|
if res is False:
|
|
332
338
|
raise AhocorasickError("Error performing ahocorasick call")
|
|
File without changes
|