linkml_store-0.3.0-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between package versions.
Files changed (101)
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/plotting/heatmap.py
@@ -0,0 +1,383 @@
+ """
+ Heatmap visualization module for LinkML data.
+
+ This module provides functions to generate heatmaps from pandas DataFrames or tabular data files.
+ """
+
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ from matplotlib.colors import LinearSegmentedColormap
+ from scipy.cluster import hierarchy
+ from scipy.spatial import distance
+
+ from linkml_store.utils.format_utils import Format, load_objects, write_output
+
+ logger = logging.getLogger(__name__)
+
+
+ def create_heatmap(
+     data: pd.DataFrame,
+     x_column: str,
+     y_column: str,
+     value_column: Optional[str] = None,
+     minimum_value: Optional[float] = None,
+     title: Optional[str] = None,
+     figsize: Tuple[int, int] = (10, 8),
+     cmap: Union[str, LinearSegmentedColormap] = "YlGnBu",
+     annot: bool = True,
+     fmt: Optional[str] = None,  # dynamically determined from the data if None
+     linewidths: float = 0.5,
+     linecolor: str = "white",
+     square: bool = False,
+     output_file: Optional[str] = None,
+     dpi: int = 300,
+     missing_value: Any = np.nan,
+     vmin: Optional[float] = None,
+     vmax: Optional[float] = None,
+     robust: bool = False,
+     remove_duplicates: bool = False,
+     font_size: int = 10,
+     cluster: Union[bool, Literal["both", "x", "y"]] = False,
+     cluster_method: str = "complete",  # linkage method: complete, average, single, etc.
+     cluster_metric: str = "euclidean",  # distance metric: euclidean, cosine, etc.
+     **kwargs,
+ ) -> Tuple[plt.Figure, plt.Axes]:
+     """
+     Create a heatmap from a pandas DataFrame.
+
+     Args:
+         data: Input DataFrame containing the data to plot
+         x_column: Column to use for x-axis categories
+         y_column: Column to use for y-axis categories
+         value_column: Column containing values for the heatmap. If None, frequency counts will be used.
+         minimum_value: Minimum value to include in the heatmap
+         title: Title for the heatmap
+         figsize: Figure size as (width, height) in inches
+         cmap: Colormap for the heatmap
+         annot: Whether to annotate cells with values
+         fmt: String formatting code for annotations (auto-detected if None)
+         linewidths: Width of lines between cells
+         linecolor: Color of lines between cells
+         square: Whether to make cells square
+         output_file: File path to save the figure (optional)
+         dpi: Resolution for saved figure
+         missing_value: Value to use for missing data (defaults to NaN)
+         vmin: Minimum value for colormap scaling
+         vmax: Maximum value for colormap scaling
+         robust: If True, compute colormap limits using robust quantiles instead of min/max
+         remove_duplicates: If True, removes duplicate rows before creating the heatmap
+         font_size: Font size for annotations
+         cluster: Whether and which axes to cluster:
+             - False: No clustering (default)
+             - True or "both": Cluster both x and y axes
+             - "x": Cluster only x-axis
+             - "y": Cluster only y-axis
+         cluster_method: Linkage method for hierarchical clustering
+             (e.g., "single", "complete", "average", "ward")
+         cluster_metric: Distance metric for clustering (e.g., "euclidean", "correlation", "cosine")
+         **kwargs: Additional keyword arguments to pass to seaborn's heatmap function
+
+     Returns:
+         Tuple containing the figure and axes objects
+     """
+     # Validate input
+     if x_column not in data.columns:
+         raise ValueError(f"x_column '{x_column}' not found in DataFrame columns: {list(data.columns)}")
+     if y_column not in data.columns:
+         raise ValueError(f"y_column '{y_column}' not found in DataFrame columns: {list(data.columns)}")
+     if value_column and value_column not in data.columns:
+         raise ValueError(f"value_column '{value_column}' not found in DataFrame columns: {list(data.columns)}")
+
+     cols = [x_column, y_column]
+     if value_column:
+         cols.append(value_column)
+
+     # Restrict to the columns needed for the plot
+     data = data[cols]
+
+     # Explode list-valued cells into one row per value (sampling the first
+     # 100 non-null values to detect list-like columns)
+     if any(isinstance(val, (list, set, tuple)) for val in data[x_column].dropna().head(100)):
+         logger.info(f"Exploding list values in x_column '{x_column}'")
+         data = data.explode(x_column).dropna(subset=[x_column])
+
+     if any(isinstance(val, (list, set, tuple)) for val in data[y_column].dropna().head(100)):
+         logger.info(f"Exploding list values in y_column '{y_column}'")
+         data = data.explode(y_column).dropna(subset=[y_column])
+
+     if value_column:
+         if any(isinstance(val, (list, set, tuple)) for val in data[value_column].dropna().head(100)):
+             logger.info(f"Exploding list values in value_column '{value_column}'")
+             data = data.explode(value_column).dropna(subset=[value_column])
+
+     # Optionally drop duplicate rows before pivoting (off by default for this function)
+     if remove_duplicates:
+         data = data.drop_duplicates()
+
+     if value_column and minimum_value is not None:
+         data = data[data[value_column] >= minimum_value]
+
+     # Prepare the data
+     if value_column:
+         # Use the provided value column; duplicate (x, y) pairs are aggregated by their mean
+         pivot_data = data.pivot_table(
+             index=y_column,
+             columns=x_column,
+             values=value_column,
+             aggfunc='mean',
+             fill_value=missing_value
+         )
+     else:
+         # Use frequency counts
+         cross_tab = pd.crosstab(data[y_column], data[x_column])
+         pivot_data = cross_tab
+
+     # Auto-detect format string if not provided
+     if fmt is None:
+         # Check if the pivot table contains integers only
+         if pivot_data.dtypes.apply(lambda x: pd.api.types.is_integer_dtype(x)).all():
+             fmt = 'd'  # Integer format
+         else:
+             fmt = '.1f'  # One decimal place for floats
+
+     # Make sure all cells have a reasonable minimum size
+     min_height = max(4, 80 / len(pivot_data.index) if len(pivot_data.index) > 0 else 10)
+     min_width = max(4, 80 / len(pivot_data.columns) if len(pivot_data.columns) > 0 else 10)
+
+     # Adjust figure size based on the number of rows and columns
+     adjusted_height = max(figsize[1], min_height * len(pivot_data.index) / 10)
+     adjusted_width = max(figsize[0], min_width * len(pivot_data.columns) / 10)
+     adjusted_figsize = (adjusted_width, adjusted_height)
+
+     # Create figure and axes
+     fig, ax = plt.subplots(figsize=adjusted_figsize)
+
+     # Apply clustering if requested
+     row_linkage = None
+     col_linkage = None
+
+     if cluster:
+         cluster_axes = cluster
+         if cluster_axes is True:
+             cluster_axes = "both"
+
+         # Fill NAs for clustering
+         pivot_data_for_clustering = pivot_data.fillna(0)
+
+         # Cluster rows (y-axis)
+         if cluster_axes in ["both", "y"]:
+             try:
+                 # Calculate distance matrix and linkage for rows
+                 row_distances = distance.pdist(pivot_data_for_clustering.values, metric=cluster_metric)
+                 row_linkage = hierarchy.linkage(row_distances, method=cluster_method)
+
+                 # Reorder rows based on clustering
+                 row_dendrogram = hierarchy.dendrogram(row_linkage, no_plot=True)
+                 row_order = row_dendrogram['leaves']
+                 pivot_data = pivot_data.iloc[row_order]
+
+                 logger.info(f"Applied clustering to rows using {cluster_method} linkage and {cluster_metric} metric")
+             except Exception as e:
+                 logger.warning(f"Failed to cluster rows: {e}")
+
+         # Cluster columns (x-axis)
+         if cluster_axes in ["both", "x"]:
+             try:
+                 # Calculate distance matrix and linkage for columns
+                 col_distances = distance.pdist(pivot_data_for_clustering.values.T, metric=cluster_metric)
+                 col_linkage = hierarchy.linkage(col_distances, method=cluster_method)
+
+                 # Reorder columns based on clustering
+                 col_dendrogram = hierarchy.dendrogram(col_linkage, no_plot=True)
+                 col_order = col_dendrogram['leaves']
+                 pivot_data = pivot_data.iloc[:, col_order]
+
+                 logger.info(f"Applied clustering to columns using {cluster_method} linkage and {cluster_metric} metric")
+             except Exception as e:
+                 logger.warning(f"Failed to cluster columns: {e}")
+
+     # Create the heatmap
+     sns.heatmap(
+         pivot_data,
+         cmap=cmap,
+         annot=annot,
+         fmt=fmt,
+         linewidths=linewidths,
+         linecolor=linecolor,
+         square=square,
+         vmin=vmin,
+         vmax=vmax,
+         robust=robust,
+         ax=ax,
+         annot_kws={'fontsize': font_size},
+         **kwargs
+     )
+
+     # Set title if provided
+     if title:
+         ax.set_title(title, fontsize=font_size + 4)
+
+     # Improve display of tick labels
+     plt.xticks(rotation=45, ha="right", fontsize=font_size)
+     plt.yticks(rotation=0, fontsize=font_size)
+
+     # Add grid lines to make the table more readable
+     ax.grid(False)
+
+     # Improve contrast for better readability
+     for _, spine in ax.spines.items():
+         spine.set_visible(True)
+         spine.set_color('black')
+         spine.set_linewidth(1)
+
+     # Adjust layout
+     plt.tight_layout()
+
+     # Save the figure if output file is specified
+     if output_file:
+         output_path = Path(output_file)
+         output_dir = output_path.parent
+         if not output_dir.exists():
+             output_dir.mkdir(parents=True, exist_ok=True)
+         plt.savefig(output_file, dpi=dpi, bbox_inches="tight")
+         logger.info(f"Heatmap saved to {output_file}")
+
+     return fig, ax
+
+
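For orientation, here is a minimal usage sketch for create_heatmap; the DataFrame contents, column names, and output path are invented for illustration.

import pandas as pd
from linkml_store.plotting.heatmap import create_heatmap

# Hypothetical sample data: one expression value per (gene, tissue) pair
df = pd.DataFrame(
    {
        "tissue": ["liver", "liver", "brain", "brain"],
        "gene": ["TP53", "BRCA1", "TP53", "BRCA1"],
        "expression": [5.2, 3.1, 7.8, 2.4],
    }
)

# Pivot tissue (x) against gene (y), color cells by mean expression,
# hierarchically cluster the rows, and save the figure
fig, ax = create_heatmap(
    df,
    x_column="tissue",
    y_column="gene",
    value_column="expression",
    cluster="y",
    output_file="expression_heatmap.png",
)

Omitting value_column would instead plot frequency counts via pd.crosstab, as implemented above.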
+ def heatmap_from_file(
+     file_path: str,
+     x_column: str,
+     y_column: str,
+     value_column: Optional[str] = None,
+     format: Optional[Union[Format, str]] = None,
+     compression: Optional[str] = None,
+     output_file: Optional[str] = None,
+     remove_duplicates: bool = True,
+     **kwargs,
+ ) -> Tuple[plt.Figure, plt.Axes]:
+     """
+     Create a heatmap from a file (CSV, TSV, etc.).
+
+     Args:
+         file_path: Path to the input file or "-" for stdin
+         x_column: Column to use for x-axis categories
+         y_column: Column to use for y-axis categories
+         value_column: Column containing values for the heatmap. If None, frequency counts will be used.
+         format: Format of the input file (auto-detected if None)
+         compression: Compression format ('gz' or 'tgz')
+         output_file: File path to save the figure (optional)
+         remove_duplicates: If True, removes duplicate rows before creating the heatmap
+         **kwargs: Additional arguments to pass to create_heatmap
+
+     Returns:
+         Tuple containing the figure and axes objects
+     """
+     # Handle stdin input safely; click is imported locally because it is
+     # only needed for the usage error raised below
+     import sys
+
+     import click
+
+     # Load the data
+     if file_path == "-":
+         # Read directly from stdin, since format_utils would use sys.stdin, which may already be consumed
+         if not format or str(format).lower() in ['csv', 'tsv']:
+             # Default to CSV if no format is specified
+             delimiter = ',' if not format or str(format).lower() == 'csv' else '\t'
+             df = pd.read_csv(sys.stdin, delimiter=delimiter)
+         else:
+             # Try to use format_utils, but with a backup plan
+             try:
+                 objs = load_objects(file_path, format=format, compression=compression)
+                 df = pd.DataFrame(objs)
+             except ValueError as e:
+                 if "I/O operation on closed file" in str(e):
+                     logger.warning("Could not read from stdin. It may have been consumed already.")
+                     raise click.UsageError("Error reading from stdin. Please provide a file path or ensure stdin has data.")
+                 else:
+                     raise
+     else:
+         # For regular files, use format_utils as normal
+         if (not format or format in ["csv", "tsv"]) and not compression:
+             # Use a tab separator for TSV input; read_csv would otherwise assume commas
+             sep = "\t" if str(format).lower() == "tsv" or str(file_path).endswith(".tsv") else ","
+             df = pd.read_csv(file_path, sep=sep)
+         else:
+             objs = load_objects(file_path, format=format, compression=compression)
+             df = pd.DataFrame(objs)
+
+     # Create the heatmap
+     return create_heatmap(
+         data=df,
+         x_column=x_column,
+         y_column=y_column,
+         value_column=value_column,
+         output_file=output_file,
+         remove_duplicates=remove_duplicates,
+         **kwargs
+     )
+
+
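A corresponding sketch for the file-level wrapper, assuming a hypothetical expression.tsv with the same columns as the DataFrame above:

from linkml_store.plotting.heatmap import heatmap_from_file

# Reads expression.tsv with a tab separator, pivots tissue vs gene,
# and forwards remaining keyword arguments to create_heatmap
fig, ax = heatmap_from_file(
    "expression.tsv",
    x_column="tissue",
    y_column="gene",
    value_column="expression",
    format="tsv",
    output_file="expression_heatmap.png",
)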
+ def export_heatmap_data(
+     data: pd.DataFrame,
+     x_column: str,
+     y_column: str,
+     value_column: Optional[str] = None,
+     output_file: Optional[str] = None,
+     format: Union[Format, str] = Format.CSV,
+     missing_value: Any = np.nan,
+     remove_duplicates: bool = True,
+ ) -> pd.DataFrame:
+     """
+     Export heatmap data to a file or return it as a DataFrame.
+
+     Args:
+         data: Input DataFrame containing the data
+         x_column: Column to use for x-axis categories
+         y_column: Column to use for y-axis categories
+         value_column: Column containing values for the heatmap. If None, frequency counts will be used.
+         output_file: File path to save the data (optional)
+         format: Output format for the file
+         missing_value: Value to use for missing data
+         remove_duplicates: If True, removes duplicate rows before creating the pivot table
+
+     Returns:
+         DataFrame containing the pivot table data
+     """
+     # Remove duplicates by default (assume they are accidental unless the user overrides)
+     if remove_duplicates:
+         # Keep the first occurrence of each (x_column, y_column) combination
+         data = data.drop_duplicates(subset=[x_column, y_column])
+
+     # Prepare the data
+     if value_column:
+         # Use the provided value column
+         pivot_data = data.pivot_table(
+             index=y_column,
+             columns=x_column,
+             values=value_column,
+             aggfunc='mean',
+             fill_value=missing_value
+         )
+     else:
+         # Use frequency counts
+         cross_tab = pd.crosstab(data[y_column], data[x_column])
+         pivot_data = cross_tab
+
+     # Reset index to make the y_column a regular column
+     result_df = pivot_data.reset_index()
+
+     # Write to file if output_file is provided
+     if output_file:
+         # Convert to records format for writing
+         records = result_df.to_dict(orient='records')
+         write_output(records, format=format, target=output_file)
+         logger.info(f"Heatmap data saved to {output_file}")
+
+     return result_df
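And a sketch of exporting the underlying pivot table rather than rendering it (df as in the earlier sketch; the output path is a placeholder):

from linkml_store.plotting.heatmap import export_heatmap_data
from linkml_store.utils.format_utils import Format

# Returns the pivot table as a DataFrame and writes it out as CSV records
pivot_df = export_heatmap_data(
    df,
    x_column="tissue",
    y_column="gene",
    value_column="expression",
    output_file="heatmap_data.csv",
    format=Format.CSV,
)
print(pivot_df.head())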
linkml_store/utils/change_utils.py
@@ -0,0 +1,17 @@
+ from typing import List
+
+ from linkml_store.api.collection import OBJECT
+
+
+ def insert_operation_to_patches(objs: List[OBJECT], **kwargs):
+     """
+     Translate a list of objects to a list of patches for insertion.
+
+     Note: inserts are always treated as being at the start of a list
+
+     :param objs: objects to insert
+     :param kwargs: additional arguments
+     """
+     # Each inserted object becomes a JSON Patch "add" at the head of the target list
+     patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
+     return patches
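A quick illustration of the patch shape this helper produces, using arbitrary dicts as objects:

from linkml_store.utils.change_utils import insert_operation_to_patches

patches = insert_operation_to_patches([{"id": "x1"}, {"id": "x2"}])
# -> [{"op": "add", "path": "/0", "value": {"id": "x1"}},
#     {"op": "add", "path": "/0", "value": {"id": "x2"}}]

Because every patch targets path "/0", applying them in order prepends each object, consistent with the docstring's note that inserts land at the start of the list.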
linkml_store/utils/dat_parser.py
@@ -0,0 +1,95 @@
+ from typing import Any, Dict, List, Optional, Tuple
+
+ ENTRY = Dict[str, Any]
+
+
+ def parse_sib_format(text: str) -> Tuple[Optional[ENTRY], List[ENTRY]]:
+     """
+     Parse SIB/Swiss-Prot format data into structured dictionaries.
+
+     Args:
+         text (str): The text in SIB/Swiss-Prot format
+
+     Returns:
+         tuple: The header entry (or None if absent) and a list of parsed entries
+     """
+     # Split the text into entries (separated by //)
+     entries = text.split("//\n")
+     header = None
+
+     # Initialize results list
+     results = []
+
+     # Parse each entry
+     for entry in entries:
+         if not entry.strip():
+             continue
+
+         # Initialize dictionary for current entry
+         current_entry = {}
+         current_code = None
+
+         # Process each line
+         for line in entry.strip().split("\n"):
+             if not line.strip():
+                 continue
+
+             # Check if this is a new field (starts with a 2-letter code followed by a space)
+             if len(line) > 2 and line[2] == " ":
+                 current_code = line[0:2]
+                 # Remove the code and the following space(s)
+                 value = line[3:].strip()
+
+                 # Initialize as list if needed for multi-line fields
+                 if current_code not in current_entry:
+                     current_entry[current_code] = []
+
+                 current_entry[current_code].append(value)
+
+             # Continuation of previous field
+             elif current_code is not None:
+                 # Handle continuation lines (typically indented)
+                 if current_code == "CC":
+                     # For comments, preserve the indentation
+                     current_entry[current_code].append(line)
+                 else:
+                     # For other fields, strip and append
+                     current_entry[current_code].append(line.strip())
+
+         # Combine multiline comments; e.g.
+         #   -!- ...
+         #       ...
+         #   -!- ...
+         ccs = current_entry.get("CC", [])
+         new_ccs = []
+         for cc in ccs:
+             if not cc.startswith("-!-") and new_ccs:
+                 new_ccs[-1] += " " + cc
+             else:
+                 new_ccs.append(cc)
+         current_entry["CC"] = new_ccs
+         for k, vs in current_entry.items():
+             if k != "CC":
+                 combined = "".join(vs)
+                 combined = combined.strip()
+                 if combined.endswith("."):
+                     # Sentence-terminated fields are split into individual values
+                     combined = combined.split(".")
+                     combined = [c.strip() for c in combined if c.strip()]
+                     if k == "DE":
+                         combined = combined[0]
+                 current_entry[k] = combined
+
+         if "ID" in current_entry:
+             results.append(current_entry)
+         else:
+             header = current_entry
+
+     return header, results
+
+
+ # Example usage:
+ # header, entries = parse_sib_format(text)
+ # for entry in entries:
+ #     print(f"Entry: {entry['ID']}")
+ #     for code, values in entry.items():
+ #         print(f"  {code}: {values}")
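A runnable sketch on a toy record; the ID/DE/CC values are made up, but the two-letter-code layout matches what the parser expects:

from linkml_store.utils.dat_parser import parse_sib_format

sample = (
    "ID   1.1.1.1\n"
    "DE   Alcohol dehydrogenase.\n"
    "CC   -!- Acts on primary or secondary alcohols.\n"
    "//\n"
)

header, entries = parse_sib_format(sample)
assert header is None            # no preamble block before the first entry
print(entries[0]["ID"])          # -> "1.1.1.1"
print(entries[0]["DE"])          # -> "Alcohol dehydrogenase"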