dataknobs-xization 1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dataknobs-xization might be problematic.

.gitignore
@@ -0,0 +1,47 @@
+ .#*
+ *~
+ *#
+ .idea
+ .vscode
+ .pydevproject
+ venv/
+ .cache
+ **/.*env
+ *.pyc
+ **/__pycache__
+ .pytest_cache/
+ dist
+ .eggs/
+ *.egg-info
+ **/build
+ *.swp
+ *.orig
+ .project
+ .coverage*
+ _version.py.bld
+ .mypy_cache
+ **/build.log
+ .eggs
+ ignored
+ **/.ipynb_checkpoints
+ .s3_cache
+ .Trash-*
+ .DS_Store
+ **/_tmp
+ .data
+ *.so
+ .aws
+ VERSION
+ activate
+ .tox
+ .docker
+ .pypirc
+
+ # uv
+ .venv/
+ uv.lock
+ test-env/
+ .uv-cache/
+
+ # MkDocs documentation
+ site/
.python-version
@@ -0,0 +1 @@
+ 3.9
PKG-INFO
@@ -0,0 +1,58 @@
+ Metadata-Version: 2.4
+ Name: dataknobs-xization
+ Version: 1.0.0
+ Summary: Text normalization and tokenization tools
+ Author-email: Spence Koehler <KoehlerSB747@gmail.com>
+ Requires-Python: >=3.10
+ Requires-Dist: dataknobs-common>=1.0.0
+ Requires-Dist: dataknobs-structures>=1.0.0
+ Requires-Dist: dataknobs-utils>=1.0.0
+ Requires-Dist: nltk>=3.9.1
+ Description-Content-Type: text/markdown
+
+ # dataknobs-xization
+
+ Text normalization and tokenization tools.
+
+ ## Installation
+
+ ```bash
+ pip install dataknobs-xization
+ ```
+
+ ## Features
+
+ - **Text Normalization**: Standardize text for consistent processing
+ - **Masking Tokenizer**: Advanced tokenization with masking capabilities
+ - **Annotations**: Text annotation system
+ - **Authorities**: Authority management for text processing
+ - **Lexicon**: Lexicon-based text analysis
+
+ ## Usage
+
+ ```python
+ from dataknobs_xization import normalize, MaskingTokenizer
+
+ # Text normalization
+ normalized = normalize.normalize_text("Hello, World!")
+
+ # Tokenization with masking
+ tokenizer = MaskingTokenizer()
+ tokens = tokenizer.tokenize("This is a sample text.")
+
+ # Working with annotations
+ from dataknobs_xization import annotations
+ doc = annotations.create_document("Sample text", {"metadata": "value"})
+ ```
+
+ ## Dependencies
+
+ This package depends on:
+ - `dataknobs-common`
+ - `dataknobs-structures`
+ - `dataknobs-utils`
+ - `nltk`
+
+ ## License
+
+ See the LICENSE file in the repository root.
README.md
@@ -0,0 +1,46 @@
+ # dataknobs-xization
+
+ Text normalization and tokenization tools.
+
+ ## Installation
+
+ ```bash
+ pip install dataknobs-xization
+ ```
+
+ ## Features
+
+ - **Text Normalization**: Standardize text for consistent processing
+ - **Masking Tokenizer**: Advanced tokenization with masking capabilities
+ - **Annotations**: Text annotation system
+ - **Authorities**: Authority management for text processing
+ - **Lexicon**: Lexicon-based text analysis
+
+ ## Usage
+
+ ```python
+ from dataknobs_xization import normalize, MaskingTokenizer
+
+ # Text normalization
+ normalized = normalize.normalize_text("Hello, World!")
+
+ # Tokenization with masking
+ tokenizer = MaskingTokenizer()
+ tokens = tokenizer.tokenize("This is a sample text.")
+
+ # Working with annotations
+ from dataknobs_xization import annotations
+ doc = annotations.create_document("Sample text", {"metadata": "value"})
+ ```
+
+ ## Dependencies
+
+ This package depends on:
+ - `dataknobs-common`
+ - `dataknobs-structures`
+ - `dataknobs-utils`
+ - `nltk`
+
+ ## License
+
+ See the LICENSE file in the repository root.
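The usage example in the README imports `MaskingTokenizer` at the top level, but the package's `__init__.py` (the last file in this diff) only re-exports `CharacterFeatures` and `TextFeatures` alongside the five submodules. A minimal import sketch restricted to the names the `__init__.py` confirms:

```python
# Names confirmed by dataknobs_xization/__init__.py in this release.
from dataknobs_xization import (
    CharacterFeatures,  # re-exported from masking_tokenizer
    TextFeatures,       # re-exported from masking_tokenizer
    annotations,
    authorities,
    lexicon,
    masking_tokenizer,
    normalize,
)

# The README's MaskingTokenizer, normalize.normalize_text, and
# annotations.create_document are not visible anywhere in this diff;
# if MaskingTokenizer exists, it would live in the masking_tokenizer
# module (unverified):
# tokenizer = masking_tokenizer.MaskingTokenizer()
```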
pyproject.toml
@@ -0,0 +1,24 @@
+ [project]
+ name = "dataknobs-xization"
+ version = "1.0.0"
+ description = "Text normalization and tokenization tools"
+ readme = "README.md"
+ authors = [
+     { name = "Spence Koehler", email = "KoehlerSB747@gmail.com" }
+ ]
+ requires-python = ">=3.10"
+ dependencies = [
+     "dataknobs-common>=1.0.0",
+     "dataknobs-structures>=1.0.0",
+     "dataknobs-utils>=1.0.0",
+     "nltk>=3.9.1",
+ ]
+
+ [tool.uv.sources]
+ dataknobs-common = { workspace = true }
+ dataknobs-structures = { workspace = true }
+ dataknobs-utils = { workspace = true }
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
@@ -0,0 +1,66 @@
+ Annotations
+ - annotation abbreviation: ann
+ - Annotations for an input string represent and organize the string's tokens, as a table, into records (i.e., collections or groups) of fields,
+   - where
+     - each record has an annotation "type", e.g., an entity type
+     - each record field has a name and a value
+       - where the field name is the type of field
+       - and the field value is the token's text
+ - For example: the annotations
+   - for a "date" record (type) for the text:
+     - "July 4th, 1776"
+   - having
+     - "date" fields of "day", "month", and "year"
+     - with values of "4th", "July", and "1776", respectively
+   - are represented, in part, as the annotations:
+
+       text  ann_type  date_field
+       July  date      month
+       4th   date      day
+       1776  date      year
+
+   - which is equivalent to representing the "date" type of record as JSON:
+     - {"date": {"month": "July", "day": "4th", "year": "1776"}}
+       - with its "month", "day", and "year" fields
+       - and corresponding field values
+
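To make the table-to-record equivalence concrete, here is a minimal sketch of the pivot from annotation rows to the nested JSON form. It assumes pandas as the table representation, which these notes do not specify:

```python
import pandas as pd

# The "date" annotations from the example above, one row per token.
anns = pd.DataFrame(
    {
        "text": ["July", "4th", "1776"],
        "ann_type": ["date", "date", "date"],
        "date_field": ["month", "day", "year"],
    }
)

# Collect each record type's field/value pairs into a nested dict.
record = {
    ann_type: dict(zip(group[f"{ann_type}_field"], group["text"]))
    for ann_type, group in anns.groupby("ann_type")
}
print(record)  # {'date': {'month': 'July', 'day': '4th', 'year': '1776'}}
```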
+ - Annotations are built by annotators
+   - annotator abbreviation: anr
+   - each having an ID and a version
+
+ - Annotations are represented as a table
+   - each row is an annotation for a token
+   - with standard columns:
+     - anr_id, anr_ver
+       - Identify the annotator
+       - Used for data provenance
+     - ann_type -- Identifies the type of annotation
+       - NOTE(s):
+         - different annotators can produce the same types of annotations
+         - this enables targeted annotators for the various forms or manifestations of a type of entity to be annotated
+       - the ann_type column's values
+         - correspond to an annotation record type, "<ann_type>"
+         - whose record fields
+           - are specified as values in the "<ann_type>_field" column
+     - start_pos, end_pos
+       - Identify the token's start and end positions in the input string
+     - text
+       - Holds the token's text
+   - and non-conflicting annotator-specific columns that are "carried along"
+
+ - where ambiguities (at the token level) are represented as
+   - duplicated token annotation rows
+   - each reflecting an alternate interpretation (see the sketch below)
+
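A small illustration of such an ambiguity, using hypothetical tokens and annotation types under the same pandas assumption as above:

```python
import pandas as pd

# "May" read either as the month of a "date" or as a "person" first name:
# one annotation row per alternate interpretation of the same token span.
ambiguous = pd.DataFrame(
    {
        "text": ["May", "May"],
        "start_pos": [0, 0],
        "end_pos": [3, 3],
        "ann_type": ["date", "person"],
        "date_field": ["month", None],
        "person_field": [None, "first_name"],
    }
)
```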
+ Processes
+ - An annotator produces annotations with the standard columns:
+   - anr_id, anr_ver, ann_type, <ann_type>_field
+   - start_pos, end_pos, text
+
+ - An annotator service organizes
+   - a single annotator's annotations into field groups and entity records
+   - multiple annotators' annotations into multi-entity records
+   - producing annotations with derived columns (illustrated below):
+     - <ann_type>_num -- To distinguish annotation record instances
+     - <ann_type>_recsnum -- To identify mutually consistent groups of an annotation type's record instances
+     - ec_num -- To identify mutually consistent groups across multiple annotation types
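A sketch of how the derived columns might distinguish two alternate "date" readings of the text "7/4/1776" (hypothetical values; the grouping logic itself belongs to the annotator service):

```python
import pandas as pd

# Reading 0: month-first ("July 4"); reading 1: day-first ("April 7").
# date_num separates the two record instances of the "date" type;
# date_recsnum and ec_num (not shown) would further group mutually
# consistent instances within and across annotation types.
organized = pd.DataFrame(
    {
        "text": ["7", "4", "1776", "7", "4", "1776"],
        "ann_type": ["date"] * 6,
        "date_field": ["month", "day", "year", "day", "month", "year"],
        "date_num": [0, 0, 0, 1, 1, 1],
    }
)
```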
dataknobs_xization/__init__.py
@@ -0,0 +1,16 @@
+ """Text normalization and tokenization tools."""
+
+ from dataknobs_xization import annotations, authorities, lexicon, masking_tokenizer, normalize
+ from dataknobs_xization.masking_tokenizer import CharacterFeatures, TextFeatures
+
+ __version__ = "1.0.0"
+
+ __all__ = [
+     "CharacterFeatures",
+     "TextFeatures",
+     "annotations",
+     "authorities",
+     "lexicon",
+     "masking_tokenizer",
+     "normalize",
+ ]