bnc-lookup 1.3.0__py3-none-any.whl → 1.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bnc_lookup/find_bnc.py +8 -3
- bnc_lookup/find_freq.py +8 -3
- bnc_lookup/find_rf.py +8 -3
- bnc_lookup/normalize.py +68 -0
- {bnc_lookup-1.3.0.dist-info → bnc_lookup-1.3.2.dist-info}/METADATA +2 -1
- {bnc_lookup-1.3.0.dist-info → bnc_lookup-1.3.2.dist-info}/RECORD +9 -8
- {bnc_lookup-1.3.0.dist-info → bnc_lookup-1.3.2.dist-info}/LICENSE +0 -0
- {bnc_lookup-1.3.0.dist-info → bnc_lookup-1.3.2.dist-info}/WHEEL +0 -0
- {bnc_lookup-1.3.0.dist-info → bnc_lookup-1.3.2.dist-info}/entry_points.txt +0 -0
bnc_lookup/find_bnc.py
CHANGED
|
@@ -14,6 +14,8 @@ the singular form (with trailing 's' removed) is also checked.
|
|
|
14
14
|
import hashlib
|
|
15
15
|
import importlib
|
|
16
16
|
|
|
17
|
+
from bnc_lookup.normalize import normalize
|
|
18
|
+
|
|
17
19
|
_cache = {}
|
|
18
20
|
|
|
19
21
|
|
|
@@ -33,7 +35,10 @@ def _get_hash_set(prefix: str) -> frozenset:
|
|
|
33
35
|
|
|
34
36
|
|
|
35
37
|
def _calculate_md5(input_text: str) -> str:
|
|
36
|
-
"""Compute the MD5 hex digest of a normalized
|
|
38
|
+
"""Compute the MD5 hex digest of a normalized word.
|
|
39
|
+
|
|
40
|
+
Normalization includes apostrophe variant conversion, lowercase,
|
|
41
|
+
and whitespace stripping.
|
|
37
42
|
|
|
38
43
|
Args:
|
|
39
44
|
input_text: The word to hash.
|
|
@@ -41,7 +46,7 @@ def _calculate_md5(input_text: str) -> str:
|
|
|
41
46
|
Returns:
|
|
42
47
|
32-character hexadecimal MD5 digest string.
|
|
43
48
|
"""
|
|
44
|
-
return hashlib.md5(input_text
|
|
49
|
+
return hashlib.md5(normalize(input_text).encode()).hexdigest()
|
|
45
50
|
|
|
46
51
|
|
|
47
52
|
def _hash_exists(input_text: str) -> bool:
|
|
@@ -90,7 +95,7 @@ class FindBnc:
|
|
|
90
95
|
Returns:
|
|
91
96
|
True if the word (or its singular form) exists in the BNC.
|
|
92
97
|
"""
|
|
93
|
-
input_text = input_text
|
|
98
|
+
input_text = normalize(input_text)
|
|
94
99
|
|
|
95
100
|
if _hash_exists(input_text):
|
|
96
101
|
return True
|
bnc_lookup/find_freq.py
CHANGED
|
@@ -14,6 +14,8 @@ bucket files (f_00.py through f_ff.py) maps hash suffixes to bucket numbers.
|
|
|
14
14
|
import hashlib
|
|
15
15
|
import importlib
|
|
16
16
|
|
|
17
|
+
from bnc_lookup.normalize import normalize
|
|
18
|
+
|
|
17
19
|
_cache = {}
|
|
18
20
|
|
|
19
21
|
|
|
@@ -33,7 +35,10 @@ def _get_bucket_dict(prefix: str) -> dict:
|
|
|
33
35
|
|
|
34
36
|
|
|
35
37
|
def _calculate_md5(input_text: str) -> str:
|
|
36
|
-
"""Compute the MD5 hex digest of a normalized
|
|
38
|
+
"""Compute the MD5 hex digest of a normalized word.
|
|
39
|
+
|
|
40
|
+
Normalization includes apostrophe variant conversion, lowercase,
|
|
41
|
+
and whitespace stripping.
|
|
37
42
|
|
|
38
43
|
Args:
|
|
39
44
|
input_text: The word to hash.
|
|
@@ -41,7 +46,7 @@ def _calculate_md5(input_text: str) -> str:
|
|
|
41
46
|
Returns:
|
|
42
47
|
32-character hexadecimal MD5 digest string.
|
|
43
48
|
"""
|
|
44
|
-
return hashlib.md5(input_text
|
|
49
|
+
return hashlib.md5(normalize(input_text).encode()).hexdigest()
|
|
45
50
|
|
|
46
51
|
|
|
47
52
|
def _lookup_bucket(input_text: str) -> int | None:
|
|
@@ -89,7 +94,7 @@ class FindFreq:
|
|
|
89
94
|
1-100: Bucket number (1=most frequent, 100=least frequent)
|
|
90
95
|
None: Word not found in BNC
|
|
91
96
|
"""
|
|
92
|
-
input_text = input_text
|
|
97
|
+
input_text = normalize(input_text)
|
|
93
98
|
|
|
94
99
|
result = _lookup_bucket(input_text)
|
|
95
100
|
if result is not None:
|
bnc_lookup/find_rf.py
CHANGED
|
@@ -17,6 +17,8 @@ through rf_ff.py) maps hash suffixes to relative frequency values.
|
|
|
17
17
|
import hashlib
|
|
18
18
|
import importlib
|
|
19
19
|
|
|
20
|
+
from bnc_lookup.normalize import normalize
|
|
21
|
+
|
|
20
22
|
_cache = {}
|
|
21
23
|
|
|
22
24
|
|
|
@@ -36,7 +38,10 @@ def _get_rf_dict(prefix: str) -> dict:
|
|
|
36
38
|
|
|
37
39
|
|
|
38
40
|
def _calculate_md5(input_text: str) -> str:
|
|
39
|
-
"""Compute the MD5 hex digest of a normalized
|
|
41
|
+
"""Compute the MD5 hex digest of a normalized word.
|
|
42
|
+
|
|
43
|
+
Normalization includes apostrophe variant conversion, lowercase,
|
|
44
|
+
and whitespace stripping.
|
|
40
45
|
|
|
41
46
|
Args:
|
|
42
47
|
input_text: The word to hash.
|
|
@@ -44,7 +49,7 @@ def _calculate_md5(input_text: str) -> str:
|
|
|
44
49
|
Returns:
|
|
45
50
|
32-character hexadecimal MD5 digest string.
|
|
46
51
|
"""
|
|
47
|
-
return hashlib.md5(input_text
|
|
52
|
+
return hashlib.md5(normalize(input_text).encode()).hexdigest()
|
|
48
53
|
|
|
49
54
|
|
|
50
55
|
def _lookup_rf(input_text: str) -> float | None:
|
|
@@ -91,7 +96,7 @@ class FindRF:
|
|
|
91
96
|
Returns:
|
|
92
97
|
Float in range (0, 1), or None if word not in BNC.
|
|
93
98
|
"""
|
|
94
|
-
input_text = input_text
|
|
99
|
+
input_text = normalize(input_text)
|
|
95
100
|
|
|
96
101
|
result = _lookup_rf(input_text)
|
|
97
102
|
if result is not None:
|
bnc_lookup/normalize.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# !/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""Text normalization utilities for BNC lookup.
|
|
4
|
+
|
|
5
|
+
Handles normalization of Unicode apostrophe variants and other text
|
|
6
|
+
transformations to ensure consistent matching against the BNC corpus.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
# Unicode characters that should normalize to ASCII apostrophe (U+0027)
|
|
10
|
+
# Ordered by likelihood of occurrence in English text
|
|
11
|
+
APOSTROPHE_VARIANTS = (
|
|
12
|
+
'\u2019' # RIGHT SINGLE QUOTATION MARK (most common smart quote)
|
|
13
|
+
'\u2018' # LEFT SINGLE QUOTATION MARK
|
|
14
|
+
'\u0060' # GRAVE ACCENT
|
|
15
|
+
'\u00B4' # ACUTE ACCENT
|
|
16
|
+
'\u201B' # SINGLE HIGH-REVERSED-9 QUOTATION MARK
|
|
17
|
+
'\u2032' # PRIME
|
|
18
|
+
'\u2035' # REVERSED PRIME
|
|
19
|
+
'\u02B9' # MODIFIER LETTER PRIME
|
|
20
|
+
'\u02BC' # MODIFIER LETTER APOSTROPHE
|
|
21
|
+
'\u02C8' # MODIFIER LETTER VERTICAL LINE
|
|
22
|
+
'\u0313' # COMBINING COMMA ABOVE
|
|
23
|
+
'\u0315' # COMBINING COMMA ABOVE RIGHT
|
|
24
|
+
'\u055A' # ARMENIAN APOSTROPHE
|
|
25
|
+
'\u05F3' # HEBREW PUNCTUATION GERESH
|
|
26
|
+
'\u07F4' # NKO HIGH TONE APOSTROPHE
|
|
27
|
+
'\u07F5' # NKO LOW TONE APOSTROPHE
|
|
28
|
+
'\uFF07' # FULLWIDTH APOSTROPHE
|
|
29
|
+
'\u1FBF' # GREEK PSILI
|
|
30
|
+
'\u1FBD' # GREEK KORONIS
|
|
31
|
+
'\uA78C' # LATIN SMALL LETTER SALTILLO
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Pre-compiled translation table for fast apostrophe normalization
|
|
35
|
+
_APOSTROPHE_TABLE = str.maketrans({char: "'" for char in APOSTROPHE_VARIANTS})
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def normalize_apostrophes(text: str) -> str:
|
|
39
|
+
"""Normalize Unicode apostrophe variants to ASCII apostrophe.
|
|
40
|
+
|
|
41
|
+
Converts curly quotes, prime marks, and other apostrophe-like
|
|
42
|
+
characters to the standard ASCII apostrophe (U+0027) used in
|
|
43
|
+
the BNC corpus data.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
text: Input text potentially containing Unicode apostrophes.
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
Text with all apostrophe variants converted to ASCII apostrophe.
|
|
50
|
+
"""
|
|
51
|
+
return text.translate(_APOSTROPHE_TABLE)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def normalize(text: str) -> str:
|
|
55
|
+
"""Normalize text for BNC lookup.
|
|
56
|
+
|
|
57
|
+
Applies all normalization steps:
|
|
58
|
+
- Convert apostrophe variants to ASCII
|
|
59
|
+
- Convert to lowercase
|
|
60
|
+
- Strip leading/trailing whitespace
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
text: Input text to normalize.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
Normalized text ready for BNC lookup.
|
|
67
|
+
"""
|
|
68
|
+
return normalize_apostrophes(text).lower().strip()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bnc-lookup
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.3.2
|
|
4
4
|
Summary: Static Hash-Based Lookup for BNC Terms
|
|
5
5
|
Home-page: https://github.com/craigtrim/bnc-lookup
|
|
6
6
|
License: MIT
|
|
@@ -33,6 +33,7 @@ Description-Content-Type: text/markdown
|
|
|
33
33
|
|
|
34
34
|
[](https://badge.fury.io/py/bnc-lookup)
|
|
35
35
|
[](https://pepy.tech/project/bnc-lookup)
|
|
36
|
+
[](https://pepy.tech/project/bnc-lookup)
|
|
36
37
|
[](https://www.python.org/downloads/)
|
|
37
38
|
[](https://opensource.org/licenses/MIT)
|
|
38
39
|
[]()
|
|
@@ -101,9 +101,9 @@ bnc_lookup/bw/bw_97.py,sha256=pYgPQTbjKGEIXiaGU0P0B7wOVuY1yyt7PwT2iU3gJTA,110752
|
|
|
101
101
|
bnc_lookup/bw/bw_98.py,sha256=WMhy-xQJHGrMwC3MUAVyHFtcfwbKFUNxKL2wbJBkoJo,110145
|
|
102
102
|
bnc_lookup/bw/bw_99.py,sha256=MKGgX_qSTvI8-Q6dVh0WHyKSaRzHDL0uKtN-x_98xbU,143227
|
|
103
103
|
bnc_lookup/cli.py,sha256=36Ma2LBEgsEt2-9u0pf_u8wwM4wa0Bqx80RefBa0l7E,2211
|
|
104
|
-
bnc_lookup/find_bnc.py,sha256=
|
|
105
|
-
bnc_lookup/find_freq.py,sha256=
|
|
106
|
-
bnc_lookup/find_rf.py,sha256=
|
|
104
|
+
bnc_lookup/find_bnc.py,sha256=9IGZa6gig17F_WpO7RtQ05dACM152KAD0ohsXRljNmk,3179
|
|
105
|
+
bnc_lookup/find_freq.py,sha256=_ko6xTpf9Kb7zT3gAt65_HZJInLSZUPNDl4LOeh90M8,3140
|
|
106
|
+
bnc_lookup/find_rf.py,sha256=TMWBtpA2bWwLEuaGUp_9JYKlFnyOeiMQAOjo4OfYTuo,4229
|
|
107
107
|
bnc_lookup/find_words.py,sha256=_fRBHCuR1G3yFI69CJFEJD1jLKK47UQDPMC7-e--USQ,2206
|
|
108
108
|
bnc_lookup/freq/__init__.py,sha256=Nvot78FaLDAGJfcZ1vAGroICATR8xqtv9dzhvUeXQsM,12127
|
|
109
109
|
bnc_lookup/freq/f_00.py,sha256=Y-uDv7i6rrRQNN05fKfaX3u3EL08IXN1XBx7gLy0X00,106727
|
|
@@ -619,6 +619,7 @@ bnc_lookup/hs/h_fc.py,sha256=36PmEBdpRmYq8PCEL7g9m08mCLCQuq50uMUllCAuUdI,101225
|
|
|
619
619
|
bnc_lookup/hs/h_fd.py,sha256=N2Sh5oNn6mEBf1FKfSF9V9jWXsaCMOrj3cWgeVHvRUE,97843
|
|
620
620
|
bnc_lookup/hs/h_fe.py,sha256=90Vr4qZcGfni2nxaGUR4MxuF6Z6aRar8awM5g-Z61T0,100807
|
|
621
621
|
bnc_lookup/hs/h_ff.py,sha256=vwrosNEVBRctHXd3lC0bsY0bKMUiJnNhOLLv9Be2ouY,105595
|
|
622
|
+
bnc_lookup/normalize.py,sha256=QG4CT_9oaQ2He5uJRwDaG_G9xgIj2oRynflvQ1tGnEQ,2207
|
|
622
623
|
bnc_lookup/rf/__init__.py,sha256=hJzc1pdn9tJzJnq6ZQ6fqZABgyqrYQEBDrAbxTmyNhM,14431
|
|
623
624
|
bnc_lookup/rf/rf_00.py,sha256=W068APGlhNzO_vjmaIILu2F7ph5QZtgtde3dubNx_eA,132337
|
|
624
625
|
bnc_lookup/rf/rf_01.py,sha256=nIKNAxxpvWzWuFeOhYEe1m9ygCvlZo25Maak88jro9Y,137329
|
|
@@ -876,8 +877,8 @@ bnc_lookup/rf/rf_fc.py,sha256=8nHyAedubjFHX4_IILOxSHi3h6LONhIBqEmAX2F63Ik,138473
|
|
|
876
877
|
bnc_lookup/rf/rf_fd.py,sha256=auLl-Ldci6mJD5cG1a4bGW5opdh8A3hfIpe-JxFcCh4,133845
|
|
877
878
|
bnc_lookup/rf/rf_fe.py,sha256=Pz2T5g7IPOPsOQsJ4cF02fu9dmKj_AECR_zeE8HeZ8M,137901
|
|
878
879
|
bnc_lookup/rf/rf_ff.py,sha256=jbZ8YdL3gRuBNDMs5lLYhdy6LbUIPtKFuHnVXxP96Xo,144453
|
|
879
|
-
bnc_lookup-1.3.0.dist-info/LICENSE,sha256=AYOkuUBnkCcFiW6x4tIrmFPNbW7EG2T5HkGYtxD6cFo,2124
|
|
880
|
-
bnc_lookup-1.3.
|
|
881
|
-
bnc_lookup-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
882
|
-
bnc_lookup-1.3.0.dist-info/entry_points.txt,sha256=hh9_DCGsghIy9vqaXQH9k4GWbkyDVZWWLYhi9CC_ZJE,151
|
|
883
|
-
bnc_lookup-1.3.0.dist-info/RECORD,,
|
|
880
|
+
bnc_lookup-1.3.2.dist-info/LICENSE,sha256=AYOkuUBnkCcFiW6x4tIrmFPNbW7EG2T5HkGYtxD6cFo,2124
|
|
881
|
+
bnc_lookup-1.3.2.dist-info/METADATA,sha256=qzvj-zy-bS_nAlcf_vSNSf9hc8zltT27gXv6OUSaErA,8564
|
|
882
|
+
bnc_lookup-1.3.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
883
|
+
bnc_lookup-1.3.2.dist-info/entry_points.txt,sha256=hh9_DCGsghIy9vqaXQH9k4GWbkyDVZWWLYhi9CC_ZJE,151
|
|
884
|
+
bnc_lookup-1.3.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|