dataknobs-xization 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ Annotations
2
+ - annotation abbreviation: ann
3
+ - Annotations for an input string are a way to organize the string's tokens, as a table, into records (or collections or groups) of fields,
4
+ - where
5
+ - each record has an annotation "type", e.g., an entity type
6
+ - each record field has a name and value
7
+ - where the field name is the type of field
8
+ - and the field value is the token's text
9
+ - For example: annotations
10
+ - for a "date" record (type) for the text:
11
+ - "July 4th, 1776"
12
+ - having
13
+ - "date" fields of "day", "month", and "year"
14
+ - with values of "4th", "July", "1776", respectively
15
+ - are represented, in part, as the annotations:
16
+
17
+ text ann_type date_field
18
+ July date month
19
+ 4th date day
20
+ 1776 date year
21
+
22
+ - which is equivalent to representing the "date" type of record as json:
23
+ - {"date": {"month": "July", "day": "4th", "year": "1776"}}
24
+ - with its "month", "day", and "year" fields
25
+ - and corresponding field values
26
+
27
+ - Annotations are built by annotators
28
+ - annotator abbreviation: anr
29
+ - each having an ID and a version
30
+
31
+ - Annotations are represented as a table
32
+ - each row is an annotation for a token
33
+ - with standard columns:
34
+ - anr_id, anr_ver
35
+ - Identifies the annotator
36
+ - Used for data provenance
37
+ - ann_type -- Identifies the type of annotation
38
+ - NOTE(s):
39
+ - different annotators can produce the same types of annotations
40
+ - this enables having targeted annotators for various forms or manifestations of a type of entity to be annotated
41
+ - ann_type column's values
42
+ - correspond to an annotation record type, "<ann_type>"
43
+ - where the annotation record's fields
44
+ - are specified as values in the "<ann_type>_field" column
45
+ - start_pos, end_pos
46
+ - Identify the token's start and end positions in the input string
47
+ - text
48
+ - Holds the token text
49
+ - and non-conflicting annotator-specific columns that are "carried along"
50
+
51
+ - where ambiguities (at the token level) are represented as
52
+ - duplicated token annotation rows
53
+ - each reflecting an alternate interpretation
54
+
55
+ Processes
56
+ - An annotator produces annotations with standard columns:
57
+ - anr_id, anr_ver, ann_type, <ann_type>_field
58
+ - start_pos, end_pos, text
59
+
60
+ - An annotator service organizes
61
+ - A single annotator's annotations into field groups and entity records
62
+ - Multiple annotators' annotations into multi-entity records
63
+ - producing annotations with derived columns
64
+ - <ann_type>_num -- To distinguish annotation record instances
65
+ - <ann_type>_recsnum -- To identify mutually consistent groups of an annotation's record instances
66
+ - ec_num -- To identify mutually consistent groups of multiple annotation types
@@ -0,0 +1,110 @@
1
+ """Text normalization and tokenization tools."""
2
+
3
+ from dataknobs_xization import (
4
+ annotations,
5
+ authorities,
6
+ content_transformer,
7
+ ingestion,
8
+ json,
9
+ lexicon,
10
+ markdown,
11
+ masking_tokenizer,
12
+ normalize,
13
+ )
14
+ from dataknobs_xization.content_transformer import (
15
+ ContentTransformer,
16
+ csv_to_markdown,
17
+ json_to_markdown,
18
+ yaml_to_markdown,
19
+ )
20
+ from dataknobs_xization.markdown import (
21
+ AdaptiveStreamingProcessor,
22
+ Chunk,
23
+ ChunkFormat,
24
+ ChunkMetadata,
25
+ ChunkQualityConfig,
26
+ ChunkQualityFilter,
27
+ EnrichedChunkData,
28
+ HeadingInclusion,
29
+ MarkdownChunker,
30
+ MarkdownNode,
31
+ MarkdownParser,
32
+ StreamingMarkdownProcessor,
33
+ build_enriched_text,
34
+ chunk_markdown_tree,
35
+ format_heading_display,
36
+ get_dynamic_heading_display,
37
+ is_multiword,
38
+ parse_markdown,
39
+ stream_markdown_file,
40
+ stream_markdown_string,
41
+ )
42
+ from dataknobs_xization.masking_tokenizer import CharacterFeatures, TextFeatures
43
+ from dataknobs_xization.json import (
44
+ JSONChunk,
45
+ JSONChunkConfig,
46
+ JSONChunker,
47
+ )
48
+ from dataknobs_xization.ingestion import (
49
+ DirectoryProcessor,
50
+ FilePatternConfig,
51
+ IngestionConfigError,
52
+ KnowledgeBaseConfig,
53
+ ProcessedDocument,
54
+ process_directory,
55
+ )
56
+
57
+ __version__ = "1.2.3"
58
+
59
+ __all__ = [
60
+ # Core submodules and masking-tokenizer feature classes
61
+ "CharacterFeatures",
62
+ "TextFeatures",
63
+ "annotations",
64
+ "authorities",
65
+ "content_transformer",
66
+ "lexicon",
67
+ "masking_tokenizer",
68
+ "normalize",
69
+ # Content transformation
70
+ "ContentTransformer",
71
+ "csv_to_markdown",
72
+ "json_to_markdown",
73
+ "yaml_to_markdown",
74
+ # JSON module
75
+ "json",
76
+ "JSONChunk",
77
+ "JSONChunkConfig",
78
+ "JSONChunker",
79
+ # Markdown module
80
+ "markdown",
81
+ # Markdown chunking classes and functions
82
+ "AdaptiveStreamingProcessor",
83
+ "Chunk",
84
+ "ChunkFormat",
85
+ "ChunkMetadata",
86
+ "ChunkQualityConfig",
87
+ "ChunkQualityFilter",
88
+ "EnrichedChunkData",
89
+ "HeadingInclusion",
90
+ "MarkdownChunker",
91
+ "MarkdownNode",
92
+ "MarkdownParser",
93
+ "StreamingMarkdownProcessor",
94
+ "build_enriched_text",
95
+ "chunk_markdown_tree",
96
+ "format_heading_display",
97
+ "get_dynamic_heading_display",
98
+ "is_multiword",
99
+ "parse_markdown",
100
+ "stream_markdown_file",
101
+ "stream_markdown_string",
102
+ # Ingestion module
103
+ "ingestion",
104
+ "DirectoryProcessor",
105
+ "FilePatternConfig",
106
+ "IngestionConfigError",
107
+ "KnowledgeBaseConfig",
108
+ "ProcessedDocument",
109
+ "process_directory",
110
+ ]