linkml-store 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.

linkml_store/cli.py CHANGED
@@ -16,6 +16,7 @@ from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
 from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
 from linkml_store.utils.object_utils import object_path_update
+from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted
 
 index_type_option = click.option(
     "--index-type",
@@ -87,6 +88,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
 @click.option("-q", "--quiet/--no-quiet")
+@click.option("--base-dir", "-B", help="Base directory for the client configuration")
 @click.option(
     "--stacktrace/--no-stacktrace",
     default=False,
@@ -94,7 +96,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -117,7 +119,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     if quiet:
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
-    client = Client().from_config(config) if config else Client()
+    client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
     # DEPRECATED
@@ -150,7 +152,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         # raise ValueError("Collection must be specified if there are multiple collections.")
         if settings.database and settings.database.list_collections():
             collection = settings.database.list_collections()[0]
-            settings.collection_name = collection.name
+            settings.collection_name = collection.alias
 
 
 @cli.command()
@@ -180,15 +182,15 @@ def insert(ctx, files, object, format):
             objects = load_objects(file_path, format=format)
         else:
            objects = load_objects(file_path)
-        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.name}'.")
+        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
         collection.insert(objects)
-        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.name}'.")
+        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
             collection.insert(objects)
-            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.name}'.")
+            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()
 
 
@@ -226,7 +228,11 @@ def store(ctx, files, object, format):
 @click.pass_context
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
 def import_database(ctx, files, format):
-    """Imports a database from a dump."""
+    """Imports a database from a dump.
+
+    See the `export` command for a full list of supported formats. The same
+    formats are generally supported for imports.
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
@@ -240,7 +246,30 @@ def import_database(ctx, files, format):
 @click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
 @click.pass_context
 def export(ctx, output_type, output):
-    """Exports a database to a dump."""
+    """Exports a database to a standard dump format.
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -O yaml -o countries.yaml
+
+    Export format will be guessed from extension if not specified
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries.json
+
+    Tree formats such as YAML and JSON can natively store an entire database; each collection
+    will be a distinct key in the database.
+
+    Additionally, native dump formats can be used:
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries -O duckdb
+
+    Here, `countries` is a directory. This is equivalent to running EXPORT DATABASE
+    (see https://duckdb.org/docs/sql/statements/export.html)
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if output_type is None:
@@ -324,7 +353,7 @@ def query(ctx, where, limit, output_type, output):
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
-    query = Query(from_table=collection.name, where_clause=where_clause, limit=limit)
+    query = Query(from_table=collection.alias, where_clause=where_clause, limit=limit)
     result = collection.query(query)
     output_data = render_output(result.rows, output_type)
     if output:
@@ -341,7 +370,7 @@ def query(ctx, where, limit, output_type, output):
 def list_collections(ctx, **kwargs):
     db = ctx.obj["settings"].database
     for collection in db.list_collections(**kwargs):
-        click.echo(collection.name)
+        click.echo(collection.alias)
         click.echo(render_output(collection.metadata))
 
 
@@ -351,8 +380,9 @@ def list_collections(ctx, **kwargs):
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
 @click.option("--columns", "-S", help="Columns to facet on")
+@click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
 @click.pass_context
-def fq(ctx, where, limit, columns, output_type, output):
+def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.
 
@@ -379,11 +409,22 @@ def fq(ctx, where, limit, columns, output_type, output):
             return "+".join([str(x) for x in key])
         return key
 
-    count_dict = {}
-    for key, value in results.items():
-        value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
-        count_dict[_untuple(key)] = value_as_dict
-    output_data = render_output(count_dict, output_type)
+    if wide:
+        results_obj = facet_summary_to_dataframe_unmelted(results)
+    else:
+        if output_type == Format.PYTHON.value:
+            results_obj = results
+        elif output_type in [Format.TSV.value, Format.CSV.value]:
+            results_obj = []
+            for fc, data in results.items():
+                for v, c in data:
+                    results_obj.append({"facet": fc, "value": v, "count": c})
+        else:
+            results_obj = {}
+            for key, value in results.items():
+                value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
+                results_obj[_untuple(key)] = value_as_dict
+    output_data = render_output(results_obj, output_type)
     if output:
         with open(output, "w") as f:
            f.write(output_data)
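
For context, the new long-form branch above flattens each facet's (value, count) pairs into one row per pair for TSV/CSV output. A minimal sketch with toy data (the shape of `results` is inferred from the loop in the diff):

    # Toy facet results: facet column -> list of (value, count) pairs
    results = {
        "country": [("France", 10), ("Japan", 7)],
        "continent": [("Europe", 12), ("Asia", 9)],
    }

    rows = []
    for fc, data in results.items():
        for v, c in data:
            rows.append({"facet": fc, "value": v, "count": c})

    print(rows[0])  # {'facet': 'country', 'value': 'France', 'count': 10}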
@@ -403,14 +444,17 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
 @click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--limit", "-l", default=-1, show_default=True, type=click.INT, help="Maximum number of results to return"
+)
 @click.pass_context
-def describe(ctx, where, output_type, output):
+def describe(ctx, where, output_type, output, limit):
     """
     Describe the collection schema.
     """
     where_clause = yaml.safe_load(where) if where else None
     collection = ctx.obj["settings"].collection
-    df = collection.find(where_clause, limit=1).rows_dataframe
+    df = collection.find(where_clause, limit=limit).rows_dataframe
     write_output(df.describe(include="all").transpose(), output_type, target=output)
 
 
@@ -468,7 +512,7 @@ def search(ctx, search_term, where, limit, index_type, output_type, output, auto
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
-    logger.info(f"Attaching index to collection {collection.name}: {ix.model_dump()}")
+    logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
     result = collection.search(search_term, where=where, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
@@ -498,6 +542,7 @@ def indexes(ctx):
 def validate(ctx, output_type, output):
     """Validate objects in the specified collection."""
     collection = ctx.obj["settings"].collection
+    logger.info(f"Validating collection {collection.alias}")
     validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection()]
     output_data = render_output(validation_results, output_type)
     if output:
@@ -1,3 +1,14 @@
+"""
+Indexers package.
+
+Indexers allow indexes to be added to existing :class:`Collection` objects.
+
+Current two are supported:
+
+* simple: :class:`SimpleIndexer`
+* llm: :class:`LLMIndexer`
+"""
+
 from typing import Type
 
 from linkml_store.index.implementations.llm_indexer import LLMIndexer
@@ -14,7 +25,7 @@ def get_indexer_class(name: str) -> Type[Indexer]:
     """
     Get an indexer class by name.
 
-    :param name: the name of the indexer
+    :param name: the name of the indexer (simple, llm, ...)
     :return: the indexer class
     """
     if name not in INDEXER_CLASSES:
@@ -26,7 +37,10 @@ def get_indexer(index_type: str, **kwargs) -> Indexer:
     """
     Get an indexer by name.
 
-    :param name: the name of the indexer
+    >>> simple_indexer = get_indexer("simple")
+    >>> llm_indexer = get_indexer("llm")
+
+    :param name: the name of the indexer (simple, llm, ...)
     :param kwargs: additional arguments to pass to the indexer
     :return: the indexer
     """
@@ -74,7 +74,7 @@ class LLMIndexer(Indexer):
 
         embeddings_client = Client()
         config = CollectionConfig(
-            name=coll_name,
+            alias=coll_name,
             type="Embeddings",
             attributes={
                 "text": {"range": "string"},
@@ -116,6 +116,7 @@ class LLMIndexer(Indexer):
                 embeddings_collection.insert(
                     {"text": uncached_texts[i], "embedding": embeddings[index], "model_id": model_id}
                 )
+                embeddings_collection.commit()
         else:
             logger.info(f"Embedding {len(texts)} texts")
             embeddings = model.embed_multi(texts)
@@ -11,11 +11,22 @@ logger = logging.getLogger(__name__)
 
 
 class TemplateSyntaxEnum(str, Enum):
+    """
+    Template syntax types.
+    """
+
     jinja2 = "jinja2"
     fstring = "fstring"
 
 
-def cosine_similarity(vector1, vector2):
+def cosine_similarity(vector1, vector2) -> float:
+    """
+    Calculate the cosine similarity between two vectors
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
     dot_product = np.dot(vector1, vector2)
     norm1 = np.linalg.norm(vector1)
     norm2 = np.linalg.norm(vector2)
@@ -24,7 +35,7 @@ def cosine_similarity(vector1, vector2):
 
 class Indexer(BaseModel):
     """
-    An index operates on a collection in order to search for objects.
+    An indexer operates on a collection in order to search for objects.
     """
 
     name: Optional[str] = None
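
For reference, the newly documented `cosine_similarity` computes the standard dot product divided by the product of the vector norms; a quick worked example with numpy:

    import numpy as np

    v1 = np.array([1.0, 0.0, 1.0])
    v2 = np.array([1.0, 1.0, 0.0])
    dot_product = np.dot(v1, v2)          # 1.0
    norm1 = np.linalg.norm(v1)            # sqrt(2)
    norm2 = np.linalg.norm(v2)            # sqrt(2)
    print(dot_product / (norm1 * norm2))  # 0.5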
@@ -0,0 +1,37 @@
+import logging
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+
+def safe_remove_directory(dir_path: Path, no_backup: bool = False) -> Optional[Path]:
+    # Ensure the directory exists
+    if not dir_path.exists():
+        raise FileNotFoundError(f"Directory does not exist: {dir_path}")
+    try:
+        if no_backup:
+            # Move to a temporary directory instead of permanent removal
+            with tempfile.TemporaryDirectory() as tmpdir:
+                tmp_path = Path(tmpdir) / dir_path.name
+                shutil.move(str(dir_path), str(tmp_path))
+                logger.info(f"Directory moved to temporary location: {tmp_path}")
+                # The directory will be automatically removed when exiting the context manager
+            return None
+        else:
+            # Create a backup directory name with timestamp
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            backup_dir = dir_path.with_name(f"{dir_path.name}_backup_{timestamp}")
+
+            # Move the directory to the backup location
+            shutil.move(str(dir_path), str(backup_dir))
+            logger.info(f"Directory backed up to: {backup_dir}")
+            return backup_dir
+
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        return None
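
Usage sketch for the new helper (its module path is not shown in this diff, and the directory names below are hypothetical):

    from pathlib import Path

    # Default: move the directory to a timestamped *_backup_* sibling and return that path
    backup = safe_remove_directory(Path("countries_export"))
    print(backup)  # e.g. countries_export_backup_20240601_120000

    # With no_backup=True the directory is moved into a TemporaryDirectory
    # and discarded; the function returns None
    safe_remove_directory(Path("scratch_dir"), no_backup=True)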
@@ -1,14 +1,22 @@
 import csv
+import gzip
+import io
 import json
+import logging
 import sys
+import tarfile
 from enum import Enum
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, TextIO, Type, Union
+from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union
 
 import pandas as pd
+import pystow
 import yaml
 from pydantic import BaseModel
+from tabulate import tabulate
+
+logger = logging.getLogger(__name__)
 
 
 class Format(Enum):
@@ -21,12 +29,163 @@ class Format(Enum):
     YAML = "yaml"
     TSV = "tsv"
     CSV = "csv"
+    PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
+    TABLE = "table"
+    SQLDUMP_DUCKDB = "duckdb"
+    SQLDUMP_POSTGRES = "postgres"
+    DUMP_MONGODB = "mongodb"
+
+    @classmethod
+    def guess_format(cls, file_name: str) -> Optional["Format"]:
+        ext = Path(file_name).suffix.lower()
+
+        format_map = {
+            ".json": cls.JSON,
+            ".jsonl": cls.JSONL,
+            ".yaml": cls.YAML,
+            ".yml": cls.YAML,
+            ".tsv": cls.TSV,
+            ".csv": cls.CSV,
+            ".py": cls.PYTHON,
+            ".parquet": cls.PARQUET,
+            ".pq": cls.PARQUET,
+        }
+        fmt = format_map.get(ext, None)
+        if fmt is None:
+            if ext.startswith("."):
+                ext = ext[1:]
+            if ext in [f.value for f in Format]:
+                return Format(ext)
+        return fmt
+
+    def is_dump_format(self):
+        return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
+
+
+def load_objects_from_url(
+    url: str,
+    format: Union[Format, str] = None,
+    expected_type: Type = None,
+    local_path: Optional[str] = None,
+    **kwargs,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a URL in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    :param url: The URL to the file.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param expected_type: The target type to load the objects into.
+    :param local_path: The local path to save the file to.
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    local_path = pystow.ensure("linkml", "linkml-store", url=url)
+    logger.info(f"synced to {local_path}")
+    objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
+    if not objs:
+        raise ValueError(f"No objects loaded from URL: {url}")
+    return objs
+
+
+def process_file(
+    f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
+) -> List[Dict[str, Any]]:
+    """
+    Process a single file and return a list of objects.
+    """
+    if format == Format.JSON:
+        objs = json.load(f)
+    elif format == Format.JSONL:
+        objs = [json.loads(line) for line in f]
+    elif format == Format.YAML:
+        if expected_type and expected_type == list:  # noqa E721
+            objs = list(yaml.safe_load_all(f))
+        else:
+            objs = yaml.safe_load(f)
+    elif format in [Format.TSV, Format.CSV]:
+        if header_comment_token:
+            while True:
+                pos = f.tell()
+                line = f.readline()
+                if not line.startswith(header_comment_token):
+                    f.seek(pos)
+                    break
+        delimiter = "\t" if format == Format.TSV else ","
+        reader = csv.DictReader(f, delimiter=delimiter)
+        objs = list(reader)
+    elif format == Format.PARQUET:
+        import pyarrow.parquet as pq
+
+        table = pq.read_table(f)
+        objs = table.to_pandas().to_dict(orient="records")
+    elif format in [Format.PYTHON, Format.FORMATTED, Format.TABLE]:
+        raise ValueError(f"Format {format} is not supported for loading objects")
+    else:
+        raise ValueError(f"Unsupported file format: {format}")
+
+    if not isinstance(objs, list):
+        objs = [objs]
+    return objs
 
 
 def load_objects(
-    file_path: Union[str, Path], format: Union[Format, str] = None, expected_type: Type = None
+    file_path: Union[str, Path],
+    format: Optional[Union[Format, str]] = None,
+    compression: Optional[str] = None,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a file or archive in supported formats.
+    For tgz archives, it processes all files and concatenates the results.
+
+    :param file_path: The path to the file or archive.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
+    :param expected_type: The target type to load the objects into, e.g. list
+    :param header_comment_token: Token used for header comments to be skipped
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    if isinstance(file_path, Path):
+        file_path = str(file_path)
+
+    if isinstance(format, str):
+        format = Format(format)
+
+    all_objects = []
+
+    if compression == "tgz":
+        with tarfile.open(file_path, "r:gz") as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    f = tar.extractfile(member)
+                    if f:
+                        content = io.TextIOWrapper(f)
+                        member_format = Format.guess_format(member.name) if not format else format
+                        logger.debug(f"Processing tar member {member.name} with format {member_format}")
+                        all_objects.extend(process_file(content, member_format, expected_type, header_comment_token))
+    else:
+        if Path(file_path).is_dir():
+            raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
+        mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
+        open_func = gzip.open if compression == "gz" else open
+        format = Format.guess_format(file_path) if not format else format
+        with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
+            if compression == "gz" and mode == "r":
+                f = io.TextIOWrapper(f)
+            all_objects = process_file(f, format, expected_type, header_comment_token)
+
+    logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+    return all_objects
+
+
+def xxxload_objects(
+    file_path: Union[str, Path],
+    format: Union[Format, str] = None,
+    compression: Optional[str] = None,
+    expected_type: Type = None,
+    header_comment_token: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """
     Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -37,7 +196,7 @@ def load_objects(
 
     :param file_path: The path to the file.
     :param format: The format of the file. Can be a Format enum or a string value.
-    :param expected_type: The target type to load the objects into.
+    :param expected_type: The target type to load the objects into, e.g. list
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(format, str):
@@ -48,6 +207,12 @@ def load_objects(
 
     if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
         format = Format.PARQUET
+    if not format and file_path.endswith(".tsv"):
+        format = Format.TSV
+    if not format and file_path.endswith(".csv"):
+        format = Format.CSV
+    if not format and file_path.endswith(".py"):
+        format = Format.PYTHON
 
     mode = "r"
     if format == Format.PARQUET:
@@ -68,11 +233,29 @@ def load_objects(
                 objs = list(yaml.safe_load_all(f))
             else:
                 objs = yaml.safe_load(f)
-        elif format == Format.TSV or (not format and file_path.endswith(".tsv")):
-            reader = csv.DictReader(f, delimiter="\t")
-            objs = list(reader)
-        elif format == Format.CSV or (not format and file_path.endswith(".csv")):
-            reader = csv.DictReader(f)
+        elif format == Format.TSV or format == Format.CSV:
+            # Skip initial comment lines if comment_char is set
+            if header_comment_token:
+                # Store the original position
+                original_pos = f.tell()
+
+                # Read and store lines until we find a non-comment line
+                lines = []
+                for line in f:
+                    if not line.startswith(header_comment_token):
+                        break
+                    lines.append(line)
+
+                # Go back to the original position
+                f.seek(original_pos)
+
+                # Skip the comment lines we found
+                for _ in lines:
+                    f.readline()
+            if format == Format.TSV:
+                reader = csv.DictReader(f, delimiter="\t")
+            else:
+                reader = csv.DictReader(f)
             objs = list(reader)
         elif format == Format.PARQUET:
             import pyarrow.parquet as pq
@@ -118,7 +301,7 @@ def write_output(
 
 
 def render_output(
-    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Union[Format, str] = Format.YAML
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Optional[Union[Format, str]] = Format.YAML
 ) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -151,6 +334,9 @@ def render_output(
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")
 
+    if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
+        data = [data]
+
     if isinstance(data, BaseModel):
         data = data.model_dump()
 
@@ -158,6 +344,10 @@ def render_output(
         return json.dumps(data, indent=2, default=str)
     elif format == Format.JSONL:
         return "\n".join(json.dumps(obj) for obj in data)
+    elif format == Format.PYTHON:
+        return str(data)
+    elif format == Format.TABLE:
+        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
             return yaml.safe_dump_all(data, sort_keys=False)
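
A short sketch of the two new render_output branches above (the example rows are made up):

    from linkml_store.utils.format_utils import render_output

    rows = [{"name": "France", "code": "FR"}, {"name": "Japan", "code": "JP"}]
    print(render_output(rows, "python"))  # plain str() of the data
    print(render_output(rows, "table"))   # psql-style table via tabulate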
@@ -210,15 +400,4 @@ def guess_format(path: str) -> Optional[Format]:
     :param path: The path to the file.
     :return: The guessed format.
     """
-    if path.endswith(".json"):
-        return Format.JSON
-    elif path.endswith(".jsonl"):
-        return Format.JSONL
-    elif path.endswith(".yaml") or path.endswith(".yml"):
-        return Format.YAML
-    elif path.endswith(".tsv"):
-        return Format.TSV
-    elif path.endswith(".csv"):
-        return Format.CSV
-    else:
-        return None
+    return Format.guess_format(path)
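
Taken together, the new Format.guess_format classmethod and the reworked load_objects give a single entry point for file and archive loading; a minimal sketch (file names are hypothetical):

    from linkml_store.utils.format_utils import Format, load_objects

    assert Format.guess_format("countries.yaml") == Format.YAML
    assert Format.guess_format("countries.pq") == Format.PARQUET

    # Plain file: format is guessed from the extension when not given
    objs = load_objects("countries.csv", header_comment_token="#")

    # tar.gz archive: each member file is loaded and the results are concatenated
    more_objs = load_objects("dump.tgz", compression="tgz")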