graflo 1.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of graflo might be problematic.
- graflo/README.md +18 -0
- graflo/__init__.py +39 -0
- graflo/architecture/__init__.py +37 -0
- graflo/architecture/actor.py +974 -0
- graflo/architecture/actor_util.py +425 -0
- graflo/architecture/edge.py +295 -0
- graflo/architecture/onto.py +374 -0
- graflo/architecture/resource.py +161 -0
- graflo/architecture/schema.py +136 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +93 -0
- graflo/architecture/vertex.py +277 -0
- graflo/caster.py +409 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +144 -0
- graflo/cli/manage_dbs.py +193 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/db/__init__.py +32 -0
- graflo/db/arango/__init__.py +16 -0
- graflo/db/arango/conn.py +734 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/connection.py +304 -0
- graflo/db/manager.py +104 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +432 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +400 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +186 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +556 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +739 -0
- graflo/util/merge.py +148 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +63 -0
- graflo/util/transform.py +406 -0
- graflo-1.1.0.dist-info/METADATA +157 -0
- graflo-1.1.0.dist-info/RECORD +45 -0
- graflo-1.1.0.dist-info/WHEEL +4 -0
- graflo-1.1.0.dist-info/entry_points.txt +5 -0
- graflo-1.1.0.dist-info/licenses/LICENSE +126 -0
graflo/util/merge.py
ADDED
@@ -0,0 +1,148 @@
"""Document merging and discrimination utilities.

This module provides functions for merging and discriminating between documents
based on various criteria. It supports merging documents with common keys,
discriminating based on specific values, and handling different document structures.

Key Functions:
    - discriminate_by_key: Filter documents based on index fields and key presence
    - merge_doc_basis: Merge documents based on common index keys
"""

from graflo.architecture.onto import VertexRep


def discriminate_by_key(items, indexes, discriminant_key, fast=False):
    """Filter documents based on index fields and key presence.

    This function filters a list of documents based on the presence of index fields
    and a specific key. It can operate in fast mode to return after finding the
    first match.

    Args:
        items: List of documents (dictionaries) to filter
        indexes: List of index field names to check for presence
        discriminant_key: Key to check for presence
        fast: Whether to return after the first match (default: False)

    Returns:
        list[dict]: Filtered list of documents
    """
    # pick items that have any of the index fields present
    _items = [item for item in items if any(k in item for k in indexes)]

    if discriminant_key is not None:
        result = []
        for item in _items:
            if discriminant_key in item:
                result += [item]
                if fast:
                    break
        return result
    return _items


def merge_doc_basis(
    docs: list[dict],
    index_keys: tuple[str, ...],
    discriminant_key=None,
) -> list[dict]:
    """Merge documents based on common index keys.

    This function merges documents that share common index key-value combinations.
    Documents without index keys are merged into the first relevant document that
    has the discriminant key.

    Note:
        Currently works best with two groups of documents: those with and without
        the discriminant key. Future versions will support multiple discriminant
        value groups.

    Args:
        docs: List of documents to merge
        index_keys: Tuple of key names to use for merging
        discriminant_key: Optional key to use for merging documents without index keys

    Returns:
        list[dict]: Merged documents
    """
    docs_tupleized = [
        tuple(sorted((k, v) for k, v in item.items() if k in index_keys))
        for item in docs
    ]

    # pick bearing docs: those that differ by index_keys
    bearing_docs: dict[tuple, dict] = {q: dict() for q in set(docs_tupleized)}

    # merge docs with respect to unique index key-value combinations
    for doc, doc_tuple in zip(docs, docs_tupleized):
        bearing_docs[doc_tuple].update(doc)

    # merge docs without any index keys onto the first relevant doc
    if () in docs_tupleized:
        relevant_docs = discriminate_by_key(
            docs, index_keys, discriminant_key, fast=True
        )
        if relevant_docs:
            tuple_ix = tuple(
                sorted((k, v) for k, v in relevant_docs[0].items() if k in index_keys)
            )
            bearing_docs[tuple_ix].update(bearing_docs.pop(()))

    return list(bearing_docs.values())

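# Illustrative usage of merge_doc_basis (hypothetical data; "name" plays the
# role of the discriminant key). Since bearing docs are keyed through a set,
# the order of the returned list is not guaranteed:
#
#   docs = [
#       {"id": 1, "name": "alpha"},  # bearing doc for id=1
#       {"id": 1, "score": 0.5},     # same index value: merged into id=1
#       {"id": 2, "name": "beta"},   # bearing doc for id=2
#       {"page": 3},                 # no index key: merged onto the first doc
#   ]                                # carrying the discriminant key
#   merge_doc_basis(docs, index_keys=("id",), discriminant_key="name")
#   # -> [{'id': 1, 'name': 'alpha', 'score': 0.5, 'page': 3},
#   #     {'id': 2, 'name': 'beta'}]
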
def merge_doc_basis_closest_preceding(
    docs: list[VertexRep],
    index_keys: tuple[str, ...],
) -> list[VertexRep]:
    """Merge VertexRep documents based on index_keys.

    Non-ID VertexReps (those lacking all index keys) are merged into the closest
    preceding ID VertexRep. Leading non-ID VertexReps, which have no preceding
    ID VertexRep, are collected into a standalone VertexRep placed before the
    first ID VertexRep. The merge is performed on the `vertex` attribute, and
    `ctx` dicts are merged among merged VertexReps.

    Args:
        docs: List of VertexRep to merge
        index_keys: Tuple of key names to use for merging

    Returns:
        list[VertexRep]: Merged VertexReps
    """
    merged_docs: list[VertexRep] = []
    pending_non_ids: list[VertexRep] = []

    def merge_vertex_ctx(target: VertexRep, sources: list[VertexRep]):
        # merge vertex and ctx dicts of each source into the target
        for src in sources:
            target.vertex.update(src.vertex)
            target.ctx.update(src.ctx)
        return target

    for doc in docs:
        if any(k in doc.vertex for k in index_keys):
            # this is an ID VertexRep;
            # first, handle any accumulated non-ID VertexReps
            if pending_non_ids:
                if not merged_docs:
                    # no previous ID doc: create a new one from the accumulated non-IDs
                    merged_doc = VertexRep(vertex={}, ctx={})
                    merged_doc = merge_vertex_ctx(merged_doc, pending_non_ids)
                    merged_docs.append(merged_doc)
                else:
                    # merge accumulated non-IDs into the last ID doc
                    merged_docs[-1] = merge_vertex_ctx(merged_docs[-1], pending_non_ids)
                pending_non_ids.clear()

            # add the current ID VertexRep (make a copy to avoid mutating input)
            merged_docs.append(VertexRep(vertex=doc.vertex.copy(), ctx=doc.ctx.copy()))
        else:
            # this is a non-ID VertexRep, accumulate it
            pending_non_ids.append(doc)

    # handle any remaining non-ID VertexReps at the end
    if pending_non_ids and merged_docs:
        merged_docs[-1] = merge_vertex_ctx(merged_docs[-1], pending_non_ids)

    return merged_docs
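A short sketch of the folding behavior of merge_doc_basis_closest_preceding (hypothetical data; assumes VertexRep accepts vertex/ctx keyword arguments, as the function body itself does):

    from graflo.architecture.onto import VertexRep
    from graflo.util.merge import merge_doc_basis_closest_preceding

    reps = [
        VertexRep(vertex={"note": "leading"}, ctx={"a": 1}),  # precedes any ID doc
        VertexRep(vertex={"id": 1}, ctx={}),                  # ID doc
        VertexRep(vertex={"extra": True}, ctx={"b": 2}),      # folds into id=1
        VertexRep(vertex={"id": 2}, ctx={}),                  # ID doc
    ]
    out = merge_doc_basis_closest_preceding(reps, index_keys=("id",))
    # out[0]: vertex={'note': 'leading'}, ctx={'a': 1}  (standalone leading doc)
    # out[1]: vertex={'id': 1, 'extra': True}, ctx={'b': 2}
    # out[2]: vertex={'id': 2}, ctx={}
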
graflo/util/misc.py
ADDED
@@ -0,0 +1,37 @@
"""Miscellaneous utility functions.

This module provides various utility functions for data manipulation and processing.

Key Functions:
    - sorted_dicts: Recursively sort lists of flat dictionaries for consistent ordering
"""


def sorted_dicts(d):
    """Recursively sort lists of flat dictionaries for consistent ordering.

    Lists and tuples whose elements are flat dictionaries (no nested collections
    as values) are sorted by their items; dictionary values are processed
    recursively. Dictionary key order and non-collection values are preserved.

    Args:
        d: Data structure to sort (dict, list, tuple, or other)

    Returns:
        The data structure with lists of flat dicts consistently ordered

    Example:
        >>> sorted_dicts({"xs": [{"b": 2}, {"a": 1}]})
        {'xs': [{'a': 1}, {'b': 2}]}
    """
    if isinstance(d, (tuple, list)):
        # sort only when every element is a flat dict (scalar values only)
        if d and all(
            isinstance(dd, dict)
            and all(not isinstance(v, (list, tuple, dict)) for v in dd.values())
            for dd in d
        ):
            return sorted(d, key=lambda x: tuple(sorted(x.items())))
    elif isinstance(d, dict):
        return {
            k: v if not isinstance(v, (list, tuple, dict)) else sorted_dicts(v)
            for k, v in d.items()
        }

    return d
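A typical use of sorted_dicts is canonicalizing nested structures before comparing them; a minimal illustration:

    a = {"edges": [{"src": 2, "dst": 3}, {"src": 1, "dst": 2}]}
    b = {"edges": [{"src": 1, "dst": 2}, {"src": 2, "dst": 3}]}
    assert sorted_dicts(a) == sorted_dicts(b)  # flat-dict lists are normalized
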
graflo/util/onto.py
ADDED
@@ -0,0 +1,63 @@
"""Utility ontology classes for file patterns and configurations.

This module provides data classes for managing file patterns and configurations
used throughout the system. These classes support file discovery, pattern matching,
and configuration management.

Key Components:
    - FilePattern: Configuration for file pattern matching
    - Patterns: Collection of named file patterns
"""

import dataclasses
import pathlib

from graflo.onto import BaseDataclass


@dataclasses.dataclass
class FilePattern(BaseDataclass):
    """Configuration for file pattern matching.

    This class defines a pattern for matching files, including a regular expression
    for matching filenames and a subdirectory path to search in.

    Args:
        regex: Regular expression pattern for matching filenames
        sub_path: Path to search for matching files (default: "./")

    Attributes:
        regex: Regular expression pattern
        sub_path: Path to search in
    """

    regex: str | None = None
    sub_path: None | pathlib.Path = dataclasses.field(
        default_factory=lambda: pathlib.Path("./")
    )

    def __post_init__(self):
        """Initialize and validate the file pattern.

        Ensures that sub_path is a Path object and is not None.
        """
        if not isinstance(self.sub_path, pathlib.Path):
            self.sub_path = pathlib.Path(self.sub_path)
        assert self.sub_path is not None


@dataclasses.dataclass
class Patterns(BaseDataclass):
    """Collection of named file patterns.

    This class manages a collection of file patterns, each associated with a name.
    It provides a way to organize and access multiple file patterns.

    Args:
        patterns: Dictionary mapping names to FilePattern instances

    Attributes:
        patterns: Dictionary of named file patterns
    """

    patterns: dict[str, FilePattern] = dataclasses.field(default_factory=dict)
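For orientation, a minimal sketch of how these dataclasses might drive file discovery with re and pathlib (the matching loop is illustrative and assumes BaseDataclass behaves as a plain dataclass base; graflo's own discovery logic lives elsewhere):

    import pathlib
    import re

    patterns = Patterns(
        patterns={
            "tables": FilePattern(regex=r".*\.csv$", sub_path="data"),
            "configs": FilePattern(regex=r".*\.ya?ml$"),
        }
    )

    root = pathlib.Path(".")
    for name, fp in patterns.patterns.items():
        rx = re.compile(fp.regex)
        hits = [p for p in (root / fp.sub_path).glob("*") if rx.match(p.name)]
        print(name, hits)
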
graflo/util/transform.py
ADDED
@@ -0,0 +1,406 @@
"""Data transformation utilities for graph operations.

This module provides utility functions for transforming and standardizing data
in various formats, particularly for graph database operations. It includes
functions for date parsing, string standardization, and data cleaning.

Key Functions:
    - standardize: Standardize string keys and names
    - parse_date_*: Various date parsing functions for different formats
    - cast_ibes_analyst: Parse and standardize analyst names
    - clear_first_level_nones: Clean dictionaries by removing None values
    - parse_multi_item: Parse complex multi-item strings
    - pick_unique_dict: Remove duplicate dictionaries

Example:
    >>> name = standardize("John. Doe, Smith")
    >>> date = parse_date_standard("2023-01-01")
    >>> analyst = cast_ibes_analyst("ADKINS/NARRA")
"""

import json
import logging
import re
import time
from collections import defaultdict
from datetime import datetime

ORDINAL_SUFFIX = ["st", "nd", "rd", "th"]

logger = logging.getLogger(__name__)


def standardize(k):
    """Standardize a string key by removing periods and splitting.

    Handles comma- and space-separated strings, normalizing their format.

    Args:
        k (str): Input string to be standardized.

    Returns:
        str: Cleaned and standardized string.

    Example:
        >>> standardize("John. Doe, Smith")
        'John Doe,Smith'
        >>> standardize("John Doe Smith")
        'John,Doe,Smith'
    """
    k = k.translate(str.maketrans({".": ""}))
    # try to split by ", "
    k = k.split(", ")
    if len(k) < 2:
        k = k[0].split(" ")
    else:
        k[1] = k[1].translate(str.maketrans({" ": ""}))
    return ",".join(k)

def parse_date_standard(input_str):
    """Parse a date string in YYYY-MM-DD format.

    Args:
        input_str (str): Date string in YYYY-MM-DD format.

    Returns:
        tuple: (year, month, day) as integers.

    Example:
        >>> parse_date_standard("2023-01-01")
        (2023, 1, 1)
    """
    dt = datetime.strptime(input_str, "%Y-%m-%d")
    return dt.year, dt.month, dt.day


def parse_date_conf(input_str):
    """Parse a date string in YYYYMMDD format.

    Args:
        input_str (str): Date string in YYYYMMDD format.

    Returns:
        tuple: (year, month, day) as integers.

    Example:
        >>> parse_date_conf("20230101")
        (2023, 1, 1)
    """
    dt = datetime.strptime(input_str, "%Y%m%d")
    return dt.year, dt.month, dt.day


def parse_date_ibes(date0, time0):
    """Convert an IBES date and time to an ISO 8601 datetime string.

    Args:
        date0 (str/int): Date in YYYYMMDD format.
        time0 (str): Time in HH:MM:SS format, inserted verbatim (not zero-padded).

    Returns:
        str: Datetime in ISO 8601 format (YYYY-MM-DDTHH:MM:SSZ).

    Example:
        >>> parse_date_ibes(20160126, "9:35:52")
        '2016-01-26T9:35:52Z'
    """
    date0 = str(date0)
    year, month, day = date0[:4], date0[4:6], date0[6:]
    full_datetime = f"{year}-{month}-{day}T{time0}Z"

    return full_datetime


def parse_date_yahoo(date0):
    """Convert a Yahoo Finance date to ISO 8601 format.

    Args:
        date0 (str): Date in YYYY-MM-DD format.

    Returns:
        str: Datetime in ISO 8601 format with noon time.

    Example:
        >>> parse_date_yahoo("2023-01-01")
        '2023-01-01T12:00:00Z'
    """
    full_datetime = f"{date0}T12:00:00Z"
    return full_datetime

def round_str(x, **kwargs):
    """Round a string number to a specified precision.

    Args:
        x (str): String representation of a number.
        **kwargs: Additional arguments for the round() function.

    Returns:
        float: Rounded number.

    Example:
        >>> round_str("3.14159", ndigits=2)
        3.14
    """
    return round(float(x), **kwargs)


def parse_date_standard_to_epoch(input_str):
    """Convert a standard date string to a Unix epoch timestamp.

    Note:
        time.mktime interprets the date in the local timezone, so the returned
        timestamp is timezone-dependent.

    Args:
        input_str (str): Date string in YYYY-MM-DD format.

    Returns:
        float: Unix epoch timestamp.

    Example:
        >>> parse_date_standard_to_epoch("2023-01-01")  # with TZ=UTC
        1672531200.0
    """
    dt = datetime.strptime(input_str, "%Y-%m-%d").timetuple()
    timestamp = time.mktime(dt)
    return timestamp


def cast_ibes_analyst(s):
    """Split and normalize analyst name strings.

    Handles various name formats like 'ADKINS/NARRA' or 'ARFSTROM J'.

    Args:
        s (str): Analyst name string.

    Returns:
        tuple: (last_name, first_initial)

    Examples:
        >>> cast_ibes_analyst('ADKINS/NARRA')
        ('ADKINS', 'N')
        >>> cast_ibes_analyst('ARFSTROM J')
        ('ARFSTROM', 'J')
    """
    if " " in s or "\t" in s:
        # whitespace-separated: "LAST F" or "LAST FIRST"
        r = s.split()[:2]
    else:
        # slash-separated: "LAST/FIRST" or "/LAST/FIRST/..."
        r = s.split("/")
        if s.startswith("/"):
            r = r[1:3]
        else:
            r = r[:2]
    if len(r) < 2:
        return r[0], ""
    else:
        return r[0], r[1][:1]

def parse_date_reference(input_str):
    """Extract the year from a date reference string.

    Args:
        input_str (str): Date reference string.

    Returns:
        int: Year from the date reference.

    Example:
        >>> parse_date_reference("1923, May 10")
        1923
    """
    return _parse_date_reference(input_str)["year"]


def _parse_date_reference(input_str):
    """Parse complex, human-written date references.

    Handles various date formats like:
        - "1923, May 10"
        - "1923, July"
        - "1921, Sept"
        - "1935-36"
        - "1926, December 24th"

    If no format matches, the input string itself is returned as the year.

    Args:
        input_str (str): Date string in various formats.

    Returns:
        dict: Parsed date information with key 'year' and optional 'month', 'day'.

    Example:
        >>> _parse_date_reference("1923, May 10")
        {'year': 1923, 'month': 5, 'day': 10}
    """
    if "," in input_str:
        if len(input_str.split(" ")) == 3:
            # "1923, May 10" or "1926, December 24th"
            if input_str[-2:] in ORDINAL_SUFFIX:
                input_str = input_str[:-2]
            try:
                dt = datetime.strptime(input_str, "%Y, %B %d")
                return {"year": dt.year, "month": dt.month, "day": dt.day}
            except ValueError:
                try:
                    # retry with an abbreviated month name, e.g. "Sept" -> "Sep"
                    aux = input_str.split(" ")
                    input_str = " ".join([aux[0]] + [aux[1][:3]] + [aux[2]])
                    dt = datetime.strptime(input_str, "%Y, %b %d")
                    return {"year": dt.year, "month": dt.month, "day": dt.day}
                except ValueError:
                    return {"year": input_str}
        else:
            # "1923, July" or "1921, Sept"
            try:
                dt = datetime.strptime(input_str, "%Y, %B")
                return {"year": dt.year, "month": dt.month}
            except ValueError:
                try:
                    aux = input_str.split(" ")
                    input_str = " ".join([aux[0]] + [aux[1][:3]])
                    dt = datetime.strptime(input_str, "%Y, %b")
                    return {"year": dt.year, "month": dt.month}
                except (ValueError, IndexError):
                    return {"year": input_str}
    else:
        # "1935-36" or a bare year: take the first four characters
        try:
            dt = datetime.strptime(input_str[:4], "%Y")
            return {"year": dt.year}
        except ValueError:
            return {"year": input_str}

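# A quick tour of the formats handled above (outputs follow the code paths
# in _parse_date_reference):
#
#   _parse_date_reference("1923, May 10")        -> {'year': 1923, 'month': 5, 'day': 10}
#   _parse_date_reference("1926, December 24th") -> {'year': 1926, 'month': 12, 'day': 24}
#   _parse_date_reference("1921, Sept")          -> {'year': 1921, 'month': 9}
#   _parse_date_reference("1935-36")             -> {'year': 1935}
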
def try_int(x):
    """Attempt to convert a value to an integer.

    Args:
        x: Value to convert.

    Returns:
        int or original value: Integer if conversion succeeds, original value otherwise.

    Example:
        >>> try_int("123")
        123
        >>> try_int("abc")
        'abc'
    """
    try:
        x = int(x)
        return x
    except (TypeError, ValueError):
        return x


def clear_first_level_nones(docs, keys_keep_nones=None):
    """Remove falsy values from dictionaries, with optional key exceptions.

    Note:
        All falsy values (None, 0, "", empty collections) are dropped, not only
        None, unless their key is listed in keys_keep_nones.

    Args:
        docs (list): List of dictionaries to clean.
        keys_keep_nones (list, optional): Keys to keep even if their value is falsy.

    Returns:
        list: Cleaned list of dictionaries.

    Example:
        >>> docs = [{"a": 1, "b": None}, {"a": None, "b": 2}]
        >>> clear_first_level_nones(docs, keys_keep_nones=["a"])
        [{'a': 1}, {'a': None, 'b': 2}]
    """
    keys_keep_nones = keys_keep_nones or ()
    docs = [
        {k: v for k, v in tdict.items() if v or k in keys_keep_nones} for tdict in docs
    ]
    return docs


def parse_multi_item(s, mapper: dict, direct: list):
    """Parse complex multi-item strings into structured data.

    Supports parsing strings with quoted or bracketed items.

    Args:
        s (str): Input string to parse.
        mapper (dict): Mapping of input keys to output keys.
        direct (list): Direct keys to extract.

    Returns:
        defaultdict: Parsed items with lists as values.

    Example:
        >>> s = "'name:John, age:30' 'name:Jane, age:25'"
        >>> parse_multi_item(s, {"name": "full_name"}, ["age"])
        defaultdict(<class 'list'>, {'full_name': ['John', 'Jane'], 'age': ['30', '25']})
    """
    if "'" in s:
        items_str = re.findall(r"\"(.*?)\"", s) + re.findall(r"\'(.*?)\'", s)
    else:
        # remove brackets; only the first bracketed group is used,
        # and its content is split on whitespace
        items_str = re.findall(r"\[([^]]+)", s)[0].split()
    r: defaultdict[str, list] = defaultdict(list)
    for item in items_str:
        doc0 = [ss.strip().split(":") for ss in item.split(",")]
        if all([len(x) == 2 for x in doc0]):
            doc0_dict = dict(doc0)
            for n_init, n_final in mapper.items():
                try:
                    r[n_final] += [doc0_dict[n_init]]
                except KeyError:
                    r[n_final] += [None]

            for n_final in direct:
                try:
                    r[n_final] += [doc0_dict[n_final]]
                except KeyError:
                    r[n_final] += [None]
        else:
            for key, value in zip(direct, doc0):
                r[key] += [value]

    return r


def pick_unique_dict(docs):
    """Remove duplicate dictionaries from a list.

    Uses JSON serialization to identify unique dictionaries. Note that the
    order of the returned list is arbitrary (the documents pass through a set).

    Args:
        docs (list): List of dictionaries.

    Returns:
        list: List of unique dictionaries.

    Example:
        >>> docs = [{"a": 1}, {"a": 1}, {"b": 2}]
        >>> sorted(pick_unique_dict(docs), key=str)
        [{'a': 1}, {'b': 2}]
    """
    docs = {json.dumps(d, sort_keys=True) for d in docs}
    docs = [json.loads(t) for t in docs]
    return docs


def split_keep_part(s: str, sep="/", keep=-1) -> str:
    """Split a string and keep specified parts.

    Args:
        s (str): String to split.
        sep (str): Separator to split on.
        keep (int or list): Index or indices to keep.

    Returns:
        str: Joined string of kept parts.

    Example:
        >>> split_keep_part("a/b/c", keep=0)
        'a'
        >>> split_keep_part("a/b/c", keep=[0, 2])
        'a/c'
    """
    if isinstance(keep, list):
        items = s.split(sep)
        return sep.join(items[k] for k in keep)
    else:
        return s.split(sep)[keep]
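Taken together, the cleaning helpers compose into a small clean-then-dedup pipeline; a brief illustration with made-up records:

    raw = [
        {"a": 1, "b": None},
        {"a": 1, "b": None},
        {"a": None, "b": 2},
    ]
    cleaned = clear_first_level_nones(raw, keys_keep_nones=["a"])
    # -> [{'a': 1}, {'a': 1}, {'a': None, 'b': 2}]
    unique = pick_unique_dict(cleaned)
    # -> [{'a': 1}, {'a': None, 'b': 2}]  (order may vary: set-based dedup)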