bnc-lookup 1.3.0__py3-none-any.whl → 1.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bnc_lookup/find_bnc.py CHANGED
@@ -14,6 +14,8 @@ the singular form (with trailing 's' removed) is also checked.
14
14
  import hashlib
15
15
  import importlib
16
16
 
17
+ from bnc_lookup.normalize import normalize
18
+
17
19
  _cache = {}
18
20
 
19
21
 
@@ -33,7 +35,10 @@ def _get_hash_set(prefix: str) -> frozenset:
33
35
 
34
36
 
35
37
  def _calculate_md5(input_text: str) -> str:
36
- """Compute the MD5 hex digest of a normalized (lowercase, stripped) word.
38
+ """Compute the MD5 hex digest of a normalized word.
39
+
40
+ Normalization includes apostrophe variant conversion, lowercase,
41
+ and whitespace stripping.
37
42
 
38
43
  Args:
39
44
  input_text: The word to hash.
@@ -41,7 +46,7 @@ def _calculate_md5(input_text: str) -> str:
41
46
  Returns:
42
47
  32-character hexadecimal MD5 digest string.
43
48
  """
44
- return hashlib.md5(input_text.lower().strip().encode()).hexdigest()
49
+ return hashlib.md5(normalize(input_text).encode()).hexdigest()
45
50
 
46
51
 
47
52
  def _hash_exists(input_text: str) -> bool:
@@ -90,7 +95,7 @@ class FindBnc:
90
95
  Returns:
91
96
  True if the word (or its singular form) exists in the BNC.
92
97
  """
93
- input_text = input_text.lower().strip()
98
+ input_text = normalize(input_text)
94
99
 
95
100
  if _hash_exists(input_text):
96
101
  return True
bnc_lookup/find_freq.py CHANGED
@@ -14,6 +14,8 @@ bucket files (f_00.py through f_ff.py) maps hash suffixes to bucket numbers.
14
14
  import hashlib
15
15
  import importlib
16
16
 
17
+ from bnc_lookup.normalize import normalize
18
+
17
19
  _cache = {}
18
20
 
19
21
 
@@ -33,7 +35,10 @@ def _get_bucket_dict(prefix: str) -> dict:
33
35
 
34
36
 
35
37
  def _calculate_md5(input_text: str) -> str:
36
- """Compute the MD5 hex digest of a normalized (lowercase, stripped) word.
38
+ """Compute the MD5 hex digest of a normalized word.
39
+
40
+ Normalization includes apostrophe variant conversion, lowercase,
41
+ and whitespace stripping.
37
42
 
38
43
  Args:
39
44
  input_text: The word to hash.
@@ -41,7 +46,7 @@ def _calculate_md5(input_text: str) -> str:
41
46
  Returns:
42
47
  32-character hexadecimal MD5 digest string.
43
48
  """
44
- return hashlib.md5(input_text.lower().strip().encode()).hexdigest()
49
+ return hashlib.md5(normalize(input_text).encode()).hexdigest()
45
50
 
46
51
 
47
52
  def _lookup_bucket(input_text: str) -> int | None:
@@ -89,7 +94,7 @@ class FindFreq:
89
94
  1-100: Bucket number (1=most frequent, 100=least frequent)
90
95
  None: Word not found in BNC
91
96
  """
92
- input_text = input_text.lower().strip()
97
+ input_text = normalize(input_text)
93
98
 
94
99
  result = _lookup_bucket(input_text)
95
100
  if result is not None:
bnc_lookup/find_rf.py CHANGED
@@ -17,6 +17,8 @@ through rf_ff.py) maps hash suffixes to relative frequency values.
17
17
  import hashlib
18
18
  import importlib
19
19
 
20
+ from bnc_lookup.normalize import normalize
21
+
20
22
  _cache = {}
21
23
 
22
24
 
@@ -36,7 +38,10 @@ def _get_rf_dict(prefix: str) -> dict:
36
38
 
37
39
 
38
40
  def _calculate_md5(input_text: str) -> str:
39
- """Compute the MD5 hex digest of a normalized (lowercase, stripped) word.
41
+ """Compute the MD5 hex digest of a normalized word.
42
+
43
+ Normalization includes apostrophe variant conversion, lowercase,
44
+ and whitespace stripping.
40
45
 
41
46
  Args:
42
47
  input_text: The word to hash.
@@ -44,7 +49,7 @@ def _calculate_md5(input_text: str) -> str:
44
49
  Returns:
45
50
  32-character hexadecimal MD5 digest string.
46
51
  """
47
- return hashlib.md5(input_text.lower().strip().encode()).hexdigest()
52
+ return hashlib.md5(normalize(input_text).encode()).hexdigest()
48
53
 
49
54
 
50
55
  def _lookup_rf(input_text: str) -> float | None:
@@ -91,7 +96,7 @@ class FindRF:
91
96
  Returns:
92
97
  Float in range (0, 1), or None if word not in BNC.
93
98
  """
94
- input_text = input_text.lower().strip()
99
+ input_text = normalize(input_text)
95
100
 
96
101
  result = _lookup_rf(input_text)
97
102
  if result is not None:
@@ -0,0 +1,68 @@
1
+ # !/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """Text normalization utilities for BNC lookup.
4
+
5
+ Handles normalization of Unicode apostrophe variants and other text
6
+ transformations to ensure consistent matching against the BNC corpus.
7
+ """
8
+
9
+ # Unicode characters that should normalize to ASCII apostrophe (U+0027)
10
+ # Ordered by likelihood of occurrence in English text
11
+ APOSTROPHE_VARIANTS = (
12
+ '\u2019' # RIGHT SINGLE QUOTATION MARK (most common smart quote)
13
+ '\u2018' # LEFT SINGLE QUOTATION MARK
14
+ '\u0060' # GRAVE ACCENT
15
+ '\u00B4' # ACUTE ACCENT
16
+ '\u201B' # SINGLE HIGH-REVERSED-9 QUOTATION MARK
17
+ '\u2032' # PRIME
18
+ '\u2035' # REVERSED PRIME
19
+ '\u02B9' # MODIFIER LETTER PRIME
20
+ '\u02BC' # MODIFIER LETTER APOSTROPHE
21
+ '\u02C8' # MODIFIER LETTER VERTICAL LINE
22
+ '\u0313' # COMBINING COMMA ABOVE
23
+ '\u0315' # COMBINING COMMA ABOVE RIGHT
24
+ '\u055A' # ARMENIAN APOSTROPHE
25
+ '\u05F3' # HEBREW PUNCTUATION GERESH
26
+ '\u07F4' # NKO HIGH TONE APOSTROPHE
27
+ '\u07F5' # NKO LOW TONE APOSTROPHE
28
+ '\uFF07' # FULLWIDTH APOSTROPHE
29
+ '\u1FBF' # GREEK PSILI
30
+ '\u1FBD' # GREEK KORONIS
31
+ '\uA78C' # LATIN SMALL LETTER SALTILLO
32
+ )
33
+
34
+ # Pre-compiled translation table for fast apostrophe normalization
35
+ _APOSTROPHE_TABLE = str.maketrans({char: "'" for char in APOSTROPHE_VARIANTS})
36
+
37
+
38
+ def normalize_apostrophes(text: str) -> str:
39
+ """Normalize Unicode apostrophe variants to ASCII apostrophe.
40
+
41
+ Converts curly quotes, prime marks, and other apostrophe-like
42
+ characters to the standard ASCII apostrophe (U+0027) used in
43
+ the BNC corpus data.
44
+
45
+ Args:
46
+ text: Input text potentially containing Unicode apostrophes.
47
+
48
+ Returns:
49
+ Text with all apostrophe variants converted to ASCII apostrophe.
50
+ """
51
+ return text.translate(_APOSTROPHE_TABLE)
52
+
53
+
54
+ def normalize(text: str) -> str:
55
+ """Normalize text for BNC lookup.
56
+
57
+ Applies all normalization steps:
58
+ - Convert apostrophe variants to ASCII
59
+ - Convert to lowercase
60
+ - Strip leading/trailing whitespace
61
+
62
+ Args:
63
+ text: Input text to normalize.
64
+
65
+ Returns:
66
+ Normalized text ready for BNC lookup.
67
+ """
68
+ return normalize_apostrophes(text).lower().strip()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bnc-lookup
3
- Version: 1.3.0
3
+ Version: 1.3.2
4
4
  Summary: Static Hash-Based Lookup for BNC Terms
5
5
  Home-page: https://github.com/craigtrim/bnc-lookup
6
6
  License: MIT
@@ -33,6 +33,7 @@ Description-Content-Type: text/markdown
33
33
 
34
34
  [![PyPI version](https://badge.fury.io/py/bnc-lookup.svg)](https://badge.fury.io/py/bnc-lookup)
35
35
  [![Downloads](https://pepy.tech/badge/bnc-lookup)](https://pepy.tech/project/bnc-lookup)
36
+ [![Downloads/Month](https://pepy.tech/badge/bnc-lookup/month)](https://pepy.tech/project/bnc-lookup)
36
37
  [![Python 3.7+](https://img.shields.io/badge/python-3.7%2B-blue.svg)](https://www.python.org/downloads/)
37
38
  [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
38
39
  [![Tests](https://img.shields.io/badge/tests-32%20passed-brightgreen)]()
@@ -101,9 +101,9 @@ bnc_lookup/bw/bw_97.py,sha256=pYgPQTbjKGEIXiaGU0P0B7wOVuY1yyt7PwT2iU3gJTA,110752
101
101
  bnc_lookup/bw/bw_98.py,sha256=WMhy-xQJHGrMwC3MUAVyHFtcfwbKFUNxKL2wbJBkoJo,110145
102
102
  bnc_lookup/bw/bw_99.py,sha256=MKGgX_qSTvI8-Q6dVh0WHyKSaRzHDL0uKtN-x_98xbU,143227
103
103
  bnc_lookup/cli.py,sha256=36Ma2LBEgsEt2-9u0pf_u8wwM4wa0Bqx80RefBa0l7E,2211
104
- bnc_lookup/find_bnc.py,sha256=ylKY4lpQG0NV6MMZeAXU9y_Dv7gtYPPrRdBwBKt2ylc,3067
105
- bnc_lookup/find_freq.py,sha256=fe9H4N8NJYr36nwvQ2ITZ_aUxC_7VHrw50FVQNfvGZ0,3028
106
- bnc_lookup/find_rf.py,sha256=YOEvZu7-nyzLHoEtN2yKykVs11S2mcX4doj7r1YuEUQ,4117
104
+ bnc_lookup/find_bnc.py,sha256=9IGZa6gig17F_WpO7RtQ05dACM152KAD0ohsXRljNmk,3179
105
+ bnc_lookup/find_freq.py,sha256=_ko6xTpf9Kb7zT3gAt65_HZJInLSZUPNDl4LOeh90M8,3140
106
+ bnc_lookup/find_rf.py,sha256=TMWBtpA2bWwLEuaGUp_9JYKlFnyOeiMQAOjo4OfYTuo,4229
107
107
  bnc_lookup/find_words.py,sha256=_fRBHCuR1G3yFI69CJFEJD1jLKK47UQDPMC7-e--USQ,2206
108
108
  bnc_lookup/freq/__init__.py,sha256=Nvot78FaLDAGJfcZ1vAGroICATR8xqtv9dzhvUeXQsM,12127
109
109
  bnc_lookup/freq/f_00.py,sha256=Y-uDv7i6rrRQNN05fKfaX3u3EL08IXN1XBx7gLy0X00,106727
@@ -619,6 +619,7 @@ bnc_lookup/hs/h_fc.py,sha256=36PmEBdpRmYq8PCEL7g9m08mCLCQuq50uMUllCAuUdI,101225
619
619
  bnc_lookup/hs/h_fd.py,sha256=N2Sh5oNn6mEBf1FKfSF9V9jWXsaCMOrj3cWgeVHvRUE,97843
620
620
  bnc_lookup/hs/h_fe.py,sha256=90Vr4qZcGfni2nxaGUR4MxuF6Z6aRar8awM5g-Z61T0,100807
621
621
  bnc_lookup/hs/h_ff.py,sha256=vwrosNEVBRctHXd3lC0bsY0bKMUiJnNhOLLv9Be2ouY,105595
622
+ bnc_lookup/normalize.py,sha256=QG4CT_9oaQ2He5uJRwDaG_G9xgIj2oRynflvQ1tGnEQ,2207
622
623
  bnc_lookup/rf/__init__.py,sha256=hJzc1pdn9tJzJnq6ZQ6fqZABgyqrYQEBDrAbxTmyNhM,14431
623
624
  bnc_lookup/rf/rf_00.py,sha256=W068APGlhNzO_vjmaIILu2F7ph5QZtgtde3dubNx_eA,132337
624
625
  bnc_lookup/rf/rf_01.py,sha256=nIKNAxxpvWzWuFeOhYEe1m9ygCvlZo25Maak88jro9Y,137329
@@ -876,8 +877,8 @@ bnc_lookup/rf/rf_fc.py,sha256=8nHyAedubjFHX4_IILOxSHi3h6LONhIBqEmAX2F63Ik,138473
876
877
  bnc_lookup/rf/rf_fd.py,sha256=auLl-Ldci6mJD5cG1a4bGW5opdh8A3hfIpe-JxFcCh4,133845
877
878
  bnc_lookup/rf/rf_fe.py,sha256=Pz2T5g7IPOPsOQsJ4cF02fu9dmKj_AECR_zeE8HeZ8M,137901
878
879
  bnc_lookup/rf/rf_ff.py,sha256=jbZ8YdL3gRuBNDMs5lLYhdy6LbUIPtKFuHnVXxP96Xo,144453
879
- bnc_lookup-1.3.0.dist-info/LICENSE,sha256=AYOkuUBnkCcFiW6x4tIrmFPNbW7EG2T5HkGYtxD6cFo,2124
880
- bnc_lookup-1.3.0.dist-info/METADATA,sha256=Yy5pG4cEktF1zi4Rr579t1oz1KLbWuYkK3J0ax2vD8M,8463
881
- bnc_lookup-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
882
- bnc_lookup-1.3.0.dist-info/entry_points.txt,sha256=hh9_DCGsghIy9vqaXQH9k4GWbkyDVZWWLYhi9CC_ZJE,151
883
- bnc_lookup-1.3.0.dist-info/RECORD,,
880
+ bnc_lookup-1.3.2.dist-info/LICENSE,sha256=AYOkuUBnkCcFiW6x4tIrmFPNbW7EG2T5HkGYtxD6cFo,2124
881
+ bnc_lookup-1.3.2.dist-info/METADATA,sha256=qzvj-zy-bS_nAlcf_vSNSf9hc8zltT27gXv6OUSaErA,8564
882
+ bnc_lookup-1.3.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
883
+ bnc_lookup-1.3.2.dist-info/entry_points.txt,sha256=hh9_DCGsghIy9vqaXQH9k4GWbkyDVZWWLYhi9CC_ZJE,151
884
+ bnc_lookup-1.3.2.dist-info/RECORD,,