dataknobs-xization 1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dataknobs-xization might be problematic.

.gitignore
@@ -0,0 +1,47 @@
+ .#*
+ *~
+ *#
+ .idea
+ .vscode
+ .pydevproject
+ venv/
+ .cache
+ **/.*env
+ *.pyc
+ **/__pycache__
+ .pytest_cache/
+ dist
+ .eggs/
+ *.egg-info
+ **/build
+ *.swp
+ *.orig
+ .project
+ .coverage*
+ _version.py.bld
+ .mypy_cache
+ **/build.log
+ .eggs
+ ignored
+ **/.ipynb_checkpoints
+ .s3_cache
+ .Trash-*
+ .DS_Store
+ **/_tmp
+ .data
+ *.so
+ .aws
+ VERSION
+ activate
+ .tox
+ .docker
+ .pypirc
+
+ # uv
+ .venv/
+ uv.lock
+ test-env/
+ .uv-cache/
+
+ # MkDocs documentation
+ site/
.python-version
@@ -0,0 +1 @@
+ 3.9
PKG-INFO
@@ -0,0 +1,58 @@
+ Metadata-Version: 2.4
+ Name: dataknobs-xization
+ Version: 1.0.0
+ Summary: Text normalization and tokenization tools
+ Author-email: Spence Koehler <KoehlerSB747@gmail.com>
+ Requires-Python: >=3.10
+ Requires-Dist: dataknobs-common>=1.0.0
+ Requires-Dist: dataknobs-structures>=1.0.0
+ Requires-Dist: dataknobs-utils>=1.0.0
+ Requires-Dist: nltk>=3.9.1
+ Description-Content-Type: text/markdown
+
+ # dataknobs-xization
+
+ Text normalization and tokenization tools.
+
+ ## Installation
+
+ ```bash
+ pip install dataknobs-xization
+ ```
+
+ ## Features
+
+ - **Text Normalization**: Standardize text for consistent processing
+ - **Masking Tokenizer**: Advanced tokenization with masking capabilities
+ - **Annotations**: Text annotation system
+ - **Authorities**: Authority management for text processing
+ - **Lexicon**: Lexicon-based text analysis
+
+ ## Usage
+
+ ```python
+ from dataknobs_xization import normalize, MaskingTokenizer
+
+ # Text normalization
+ normalized = normalize.normalize_text("Hello, World!")
+
+ # Tokenization with masking
+ tokenizer = MaskingTokenizer()
+ tokens = tokenizer.tokenize("This is a sample text.")
+
+ # Working with annotations
+ from dataknobs_xization import annotations
+ doc = annotations.create_document("Sample text", {"metadata": "value"})
+ ```
+
+ ## Dependencies
+
+ This package depends on:
+ - `dataknobs-common`
+ - `dataknobs-structures`
+ - `dataknobs-utils`
+ - `nltk`
+
+ ## License
+
+ See the LICENSE file in the repository root.
README.md
@@ -0,0 +1,46 @@
+ # dataknobs-xization
+
+ Text normalization and tokenization tools.
+
+ ## Installation
+
+ ```bash
+ pip install dataknobs-xization
+ ```
+
+ ## Features
+
+ - **Text Normalization**: Standardize text for consistent processing
+ - **Masking Tokenizer**: Advanced tokenization with masking capabilities
+ - **Annotations**: Text annotation system
+ - **Authorities**: Authority management for text processing
+ - **Lexicon**: Lexicon-based text analysis
+
+ ## Usage
+
+ ```python
+ from dataknobs_xization import normalize, MaskingTokenizer
+
+ # Text normalization
+ normalized = normalize.normalize_text("Hello, World!")
+
+ # Tokenization with masking
+ tokenizer = MaskingTokenizer()
+ tokens = tokenizer.tokenize("This is a sample text.")
+
+ # Working with annotations
+ from dataknobs_xization import annotations
+ doc = annotations.create_document("Sample text", {"metadata": "value"})
+ ```
+
+ ## Dependencies
+
+ This package depends on:
+ - `dataknobs-common`
+ - `dataknobs-structures`
+ - `dataknobs-utils`
+ - `nltk`
+
+ ## License
+
+ See the LICENSE file in the repository root.
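The usage example in the README imports `MaskingTokenizer` at the top level, but the package's `__init__.py` (the last file in this diff) only re-exports `CharacterFeatures` and `TextFeatures` alongside the five submodules. A minimal import sketch restricted to the names the `__init__.py` confirms:

```python
# Names confirmed by dataknobs_xization/__init__.py in this release.
from dataknobs_xization import (
    CharacterFeatures,  # re-exported from masking_tokenizer
    TextFeatures,       # re-exported from masking_tokenizer
    annotations,
    authorities,
    lexicon,
    masking_tokenizer,
    normalize,
)

# The README's MaskingTokenizer, normalize.normalize_text, and
# annotations.create_document are not visible anywhere in this diff;
# if MaskingTokenizer exists, it would live in the masking_tokenizer
# module (unverified):
# tokenizer = masking_tokenizer.MaskingTokenizer()
```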
pyproject.toml
@@ -0,0 +1,24 @@
+ [project]
+ name = "dataknobs-xization"
+ version = "1.0.0"
+ description = "Text normalization and tokenization tools"
+ readme = "README.md"
+ authors = [
+     { name = "Spence Koehler", email = "KoehlerSB747@gmail.com" }
+ ]
+ requires-python = ">=3.10"
+ dependencies = [
+     "dataknobs-common>=1.0.0",
+     "dataknobs-structures>=1.0.0",
+     "dataknobs-utils>=1.0.0",
+     "nltk>=3.9.1",
+ ]
+
+ [tool.uv.sources]
+ dataknobs-common = { workspace = true }
+ dataknobs-structures = { workspace = true }
+ dataknobs-utils = { workspace = true }
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
@@ -0,0 +1,66 @@
+ Annotations
+ - annotation abbreviation: ann
+ - Annotations for an input string represent and organize the string's tokens, as a table, into records (i.e., collections or groups) of fields,
+   - where
+     - each record has an annotation "type", e.g., an entity type
+     - each record field has a name and a value
+       - where the field name is the type of field
+       - and the field value is the token's text
+ - For example: the annotations
+   - for a "date" record (type) for the text:
+     - "July 4th, 1776"
+   - having
+     - "date" fields of "day", "month", and "year"
+     - with values of "4th", "July", and "1776", respectively
+   - are represented, in part, as the annotations:
+
+       text  ann_type  date_field
+       July  date      month
+       4th   date      day
+       1776  date      year
+
+   - which is equivalent to representing the "date" type of record as JSON:
+     - {"date": {"month": "July", "day": "4th", "year": "1776"}}
+       - with its "month", "day", and "year" fields
+       - and corresponding field values
+
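To make the table-to-record equivalence concrete, here is a minimal sketch of the pivot from annotation rows to the nested JSON form. It assumes pandas as the table representation, which these notes do not specify:

```python
import pandas as pd

# The "date" annotations from the example above, one row per token.
anns = pd.DataFrame(
    {
        "text": ["July", "4th", "1776"],
        "ann_type": ["date", "date", "date"],
        "date_field": ["month", "day", "year"],
    }
)

# Collect each record type's field/value pairs into a nested dict.
record = {
    ann_type: dict(zip(group[f"{ann_type}_field"], group["text"]))
    for ann_type, group in anns.groupby("ann_type")
}
print(record)  # {'date': {'month': 'July', 'day': '4th', 'year': '1776'}}
```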
+ - Annotations are built by annotators
+   - annotator abbreviation: anr
+   - each having an ID and a version
+
+ - Annotations are represented as a table
+   - each row is an annotation for a token
+   - with standard columns:
+     - anr_id, anr_ver
+       - Identify the annotator
+       - Used for data provenance
+     - ann_type -- Identifies the type of annotation
+       - NOTE(s):
+         - different annotators can produce the same types of annotations
+         - this enables targeted annotators for the various forms or manifestations of a type of entity to be annotated
+       - the ann_type column's values
+         - correspond to an annotation record type, "<ann_type>"
+         - whose record fields
+           - are specified as values in the "<ann_type>_field" column
+     - start_pos, end_pos
+       - Identify the token's start and end positions in the input string
+     - text
+       - Holds the token's text
+   - and non-conflicting annotator-specific columns that are "carried along"
+
+ - where ambiguities (at the token level) are represented as
+   - duplicated token annotation rows
+   - each reflecting an alternate interpretation (see the sketch below)
+
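A small illustration of such an ambiguity, using hypothetical tokens and annotation types under the same pandas assumption as above:

```python
import pandas as pd

# "May" read either as the month of a "date" or as a "person" first name:
# one annotation row per alternate interpretation of the same token span.
ambiguous = pd.DataFrame(
    {
        "text": ["May", "May"],
        "start_pos": [0, 0],
        "end_pos": [3, 3],
        "ann_type": ["date", "person"],
        "date_field": ["month", None],
        "person_field": [None, "first_name"],
    }
)
```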
+ Processes
+ - An annotator produces annotations with the standard columns:
+   - anr_id, anr_ver, ann_type, <ann_type>_field
+   - start_pos, end_pos, text
+
+ - An annotator service organizes
+   - a single annotator's annotations into field groups and entity records
+   - multiple annotators' annotations into multi-entity records
+   - producing annotations with derived columns (illustrated below):
+     - <ann_type>_num -- To distinguish annotation record instances
+     - <ann_type>_recsnum -- To identify mutually consistent groups of an annotation type's record instances
+     - ec_num -- To identify mutually consistent groups across multiple annotation types
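A sketch of how the derived columns might distinguish two alternate "date" readings of the text "7/4/1776" (hypothetical values; the grouping logic itself belongs to the annotator service):

```python
import pandas as pd

# Reading 0: month-first ("July 4"); reading 1: day-first ("April 7").
# date_num separates the two record instances of the "date" type;
# date_recsnum and ec_num (not shown) would further group mutually
# consistent instances within and across annotation types.
organized = pd.DataFrame(
    {
        "text": ["7", "4", "1776", "7", "4", "1776"],
        "ann_type": ["date"] * 6,
        "date_field": ["month", "day", "year", "day", "month", "year"],
        "date_num": [0, 0, 0, 1, 1, 1],
    }
)
```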
dataknobs_xization/__init__.py
@@ -0,0 +1,16 @@
+ """Text normalization and tokenization tools."""
+
+ from dataknobs_xization import annotations, authorities, lexicon, masking_tokenizer, normalize
+ from dataknobs_xization.masking_tokenizer import CharacterFeatures, TextFeatures
+
+ __version__ = "1.0.0"
+
+ __all__ = [
+     "CharacterFeatures",
+     "TextFeatures",
+     "annotations",
+     "authorities",
+     "lexicon",
+     "masking_tokenizer",
+     "normalize",
+ ]