dataknobs-xization 1.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +110 -0
- dataknobs_xization/annotations.py +1476 -0
- dataknobs_xization/authorities.py +860 -0
- dataknobs_xization/content_transformer.py +570 -0
- dataknobs_xization/ingestion/__init__.py +27 -0
- dataknobs_xization/ingestion/config.py +352 -0
- dataknobs_xization/ingestion/processor.py +367 -0
- dataknobs_xization/json/__init__.py +17 -0
- dataknobs_xization/json/json_chunker.py +591 -0
- dataknobs_xization/lexicon.py +723 -0
- dataknobs_xization/markdown/__init__.py +72 -0
- dataknobs_xization/markdown/enrichment.py +260 -0
- dataknobs_xization/markdown/filters.py +236 -0
- dataknobs_xization/markdown/md_chunker.py +478 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +768 -0
- dataknobs_xization/normalize.py +520 -0
- dataknobs_xization/py.typed +0 -0
- dataknobs_xization-1.2.3.dist-info/METADATA +170 -0
- dataknobs_xization-1.2.3.dist-info/RECORD +23 -0
- dataknobs_xization-1.2.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Annotations
|
|
2
|
+
- annotation abbreviation: ann
|
|
3
|
+
- Annotations for an input string are a way to represent and organize the string's tokens as a table of records (or collections or groups) of fields,
|
|
4
|
+
- where
|
|
5
|
+
- each record has an annotation "type", e.g., an entity type
|
|
6
|
+
- each record field has a name and value
|
|
7
|
+
- where the field name is the type of field
|
|
8
|
+
- and the field value is the token's text
|
|
9
|
+
- For example: annotations
|
|
10
|
+
- for a "date" record (type) for the text:
|
|
11
|
+
- "July 4th, 1776"
|
|
12
|
+
- having
|
|
13
|
+
- "date" fields of "day", "month", and "year"
|
|
14
|
+
- with values of "4th", "July", "1776", respectively
|
|
15
|
+
- is represented, in part, as the annotations:
|
|
16
|
+
|
|
17
|
+
text ann_type date_field
|
|
18
|
+
July date month
|
|
19
|
+
4th date day
|
|
20
|
+
1776 date year
|
|
21
|
+
|
|
22
|
+
- which is equivalent to representing the "date" type of record as json:
|
|
23
|
+
- {"date": {"month": "July", "day": "4th", "year": "1776"}}
|
|
24
|
+
- with its "month", "day", and "year" fields
|
|
25
|
+
- and corresponding field values
|
|
26
|
+
|
|
27
|
+
- Annotations are built by annotators
|
|
28
|
+
- annotator abbreviation: anr
|
|
29
|
+
- each having an ID and a version
|
|
30
|
+
|
|
31
|
+
- Annotations are represented as a table
|
|
32
|
+
- each row is an annotation for a token
|
|
33
|
+
- with standard columns:
|
|
34
|
+
- anr_id, anr_ver
|
|
35
|
+
- Identifies the annotator
|
|
36
|
+
- Used for data provenance
|
|
37
|
+
- ann_type -- Identifies the type of annotation
|
|
38
|
+
- NOTE(s):
|
|
39
|
+
- different annotators can produce the same types of annotations
|
|
40
|
+
- this allows targeted annotators for the various forms or manifestations of an entity type to be annotated
|
|
41
|
+
- ann_type column's values
|
|
42
|
+
- correspond to an annotation record type, "<ann_type>"
|
|
43
|
+
- where the annotation record's fields
|
|
44
|
+
- are specified as values in the "<ann_type>_field" column
|
|
45
|
+
- start_pos, end_pos
|
|
46
|
+
- Identify the token's start and end positions in the input string
|
|
47
|
+
- text
|
|
48
|
+
- Holds the token text
|
|
49
|
+
- and non-conflicting annotator-specific columns that are "carried along"
|
|
50
|
+
|
|
51
|
+
- where ambiguities (at the token level) are represented as
|
|
52
|
+
- duplicated token annotation rows
|
|
53
|
+
- each reflecting an alternate interpretation
|
|
54
|
+
|
|
55
|
+
Processes
|
|
56
|
+
- An annotator produces annotations with standard columns:
|
|
57
|
+
- anr_id, anr_ver, ann_type, <ann_type>_field
|
|
58
|
+
- start_pos, end_pos, text
|
|
59
|
+
|
|
60
|
+
- An annotator service organizes
|
|
61
|
+
- A single annotator's annotations into field groups and entity records
|
|
62
|
+
- Multiple annotators' annotations into multi-entity records
|
|
63
|
+
- producing annotations with derived columns
|
|
64
|
+
- <ann_type>_num -- To distinguish annotation record instances
|
|
65
|
+
- <ann_type>_recsnum -- To identify mutually consistent groups of an annotation's record instances
|
|
66
|
+
- ec_num -- To identify mutually consistent groups of multiple annotation types
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Text normalization and tokenization tools."""
|
|
2
|
+
|
|
3
|
+
from dataknobs_xization import (
|
|
4
|
+
annotations,
|
|
5
|
+
authorities,
|
|
6
|
+
content_transformer,
|
|
7
|
+
ingestion,
|
|
8
|
+
json,
|
|
9
|
+
lexicon,
|
|
10
|
+
markdown,
|
|
11
|
+
masking_tokenizer,
|
|
12
|
+
normalize,
|
|
13
|
+
)
|
|
14
|
+
from dataknobs_xization.content_transformer import (
|
|
15
|
+
ContentTransformer,
|
|
16
|
+
csv_to_markdown,
|
|
17
|
+
json_to_markdown,
|
|
18
|
+
yaml_to_markdown,
|
|
19
|
+
)
|
|
20
|
+
from dataknobs_xization.markdown import (
|
|
21
|
+
AdaptiveStreamingProcessor,
|
|
22
|
+
Chunk,
|
|
23
|
+
ChunkFormat,
|
|
24
|
+
ChunkMetadata,
|
|
25
|
+
ChunkQualityConfig,
|
|
26
|
+
ChunkQualityFilter,
|
|
27
|
+
EnrichedChunkData,
|
|
28
|
+
HeadingInclusion,
|
|
29
|
+
MarkdownChunker,
|
|
30
|
+
MarkdownNode,
|
|
31
|
+
MarkdownParser,
|
|
32
|
+
StreamingMarkdownProcessor,
|
|
33
|
+
build_enriched_text,
|
|
34
|
+
chunk_markdown_tree,
|
|
35
|
+
format_heading_display,
|
|
36
|
+
get_dynamic_heading_display,
|
|
37
|
+
is_multiword,
|
|
38
|
+
parse_markdown,
|
|
39
|
+
stream_markdown_file,
|
|
40
|
+
stream_markdown_string,
|
|
41
|
+
)
|
|
42
|
+
from dataknobs_xization.masking_tokenizer import CharacterFeatures, TextFeatures
|
|
43
|
+
from dataknobs_xization.json import (
|
|
44
|
+
JSONChunk,
|
|
45
|
+
JSONChunkConfig,
|
|
46
|
+
JSONChunker,
|
|
47
|
+
)
|
|
48
|
+
from dataknobs_xization.ingestion import (
|
|
49
|
+
DirectoryProcessor,
|
|
50
|
+
FilePatternConfig,
|
|
51
|
+
IngestionConfigError,
|
|
52
|
+
KnowledgeBaseConfig,
|
|
53
|
+
ProcessedDocument,
|
|
54
|
+
process_directory,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
__version__ = "1.2.3"
|
|
58
|
+
|
|
59
|
+
__all__ = [
|
|
60
|
+
# Existing exports
|
|
61
|
+
"CharacterFeatures",
|
|
62
|
+
"TextFeatures",
|
|
63
|
+
"annotations",
|
|
64
|
+
"authorities",
|
|
65
|
+
"content_transformer",
|
|
66
|
+
"lexicon",
|
|
67
|
+
"masking_tokenizer",
|
|
68
|
+
"normalize",
|
|
69
|
+
# Content transformation
|
|
70
|
+
"ContentTransformer",
|
|
71
|
+
"csv_to_markdown",
|
|
72
|
+
"json_to_markdown",
|
|
73
|
+
"yaml_to_markdown",
|
|
74
|
+
# JSON module
|
|
75
|
+
"json",
|
|
76
|
+
"JSONChunk",
|
|
77
|
+
"JSONChunkConfig",
|
|
78
|
+
"JSONChunker",
|
|
79
|
+
# Markdown module
|
|
80
|
+
"markdown",
|
|
81
|
+
# Markdown chunking classes and functions
|
|
82
|
+
"AdaptiveStreamingProcessor",
|
|
83
|
+
"Chunk",
|
|
84
|
+
"ChunkFormat",
|
|
85
|
+
"ChunkMetadata",
|
|
86
|
+
"ChunkQualityConfig",
|
|
87
|
+
"ChunkQualityFilter",
|
|
88
|
+
"EnrichedChunkData",
|
|
89
|
+
"HeadingInclusion",
|
|
90
|
+
"MarkdownChunker",
|
|
91
|
+
"MarkdownNode",
|
|
92
|
+
"MarkdownParser",
|
|
93
|
+
"StreamingMarkdownProcessor",
|
|
94
|
+
"build_enriched_text",
|
|
95
|
+
"chunk_markdown_tree",
|
|
96
|
+
"format_heading_display",
|
|
97
|
+
"get_dynamic_heading_display",
|
|
98
|
+
"is_multiword",
|
|
99
|
+
"parse_markdown",
|
|
100
|
+
"stream_markdown_file",
|
|
101
|
+
"stream_markdown_string",
|
|
102
|
+
# Ingestion module
|
|
103
|
+
"ingestion",
|
|
104
|
+
"DirectoryProcessor",
|
|
105
|
+
"FilePatternConfig",
|
|
106
|
+
"IngestionConfigError",
|
|
107
|
+
"KnowledgeBaseConfig",
|
|
108
|
+
"ProcessedDocument",
|
|
109
|
+
"process_directory",
|
|
110
|
+
]
|