linkml-store 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -26,6 +26,8 @@ TYPE_MAP = {
     "JSON": "Any",
 }
 
+MEMORY_HANDLE = "duckdb:///:memory:"
+
 
 logger = logging.getLogger(__name__)
 
@@ -49,7 +51,7 @@ class DuckDBDatabase(Database):
 
     def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
         if handle is None:
-            handle = "duckdb:///:memory:"
+            handle = MEMORY_HANDLE
         if recreate_if_exists:
             path = Path(handle.replace("duckdb:///", ""))
             if path.exists():
@@ -76,6 +78,17 @@ class DuckDBDatabase(Database):
     def close(self, **kwargs):
         self.engine.dispose()
 
+    def drop(self, missing_ok=True, **kwargs):
+        self.close()
+        if self.handle == MEMORY_HANDLE:
+            return
+        path = Path(self.handle.replace("duckdb:///", ""))
+        if path.exists():
+            path.unlink()
+        else:
+            if not missing_ok:
+                raise FileNotFoundError(f"Database file not found: {path}")
+
     def query(self, query: Query, **kwargs) -> QueryResult:
         json_encoded_cols = []
         if query.from_table:
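
The new drop() follows the handle convention introduced above: in-memory handles return immediately, file-backed handles have their database file unlinked. A minimal usage sketch; the import path and the example handle are assumptions for illustration, not taken from this diff:

from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase  # assumed module path

db = DuckDBDatabase(handle="duckdb:///tmp/example.db")  # hypothetical file-backed handle
db.drop()                 # unlinks tmp/example.db if it exists
db.drop(missing_ok=True)  # a second drop is tolerated rather than raising FileNotFoundError

mem_db = DuckDBDatabase()  # handle defaults to MEMORY_HANDLE ("duckdb:///:memory:")
mem_db.drop()              # nothing to delete for in-memory databases
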
@@ -94,7 +107,8 @@ class DuckDBDatabase(Database):
         if sv:
             cd = None
             for c in self._collections.values():
-                if c.name == query.from_table or c.metadata.alias == query.from_table:
+                # if c.name == query.from_table or c.metadata.alias == query.from_table:
+                if c.alias == query.from_table or c.target_class_name == query.from_table:
                     cd = c.class_definition()
                     break
             if cd:
@@ -31,7 +31,7 @@ class FileSystemCollection(Collection[DatabaseType]):
 
     @property
     def path_to_file(self):
-        return Path(self.parent.directory_path) / f"{self.name}.{self.file_format}"
+        return Path(self.parent.directory_path) / f"{self.alias}.{self.file_format}"
 
     @property
     def objects_as_list(self) -> List[OBJECT]:
@@ -150,13 +150,20 @@ class FileSystemCollection(Collection[DatabaseType]):
         curr_objects = [o for o in self.objects_as_list if not matches(o)]
         self._set_objects(curr_objects)
 
-    def query(self, query: Query, **kwargs) -> QueryResult:
-
+    def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
+        limit = limit or query.limit
+        offset = offset or query.offset
+        if offset is None:
+            offset = 0
         where = query.where_clause or {}
         match = mongo_query_to_match_function(where)
         rows = [o for o in self.objects_as_list if match(o)]
         count = len(rows)
-        return QueryResult(query=query, num_rows=count, rows=rows)
+        if limit is None or limit < 0:
+            limit = count
+        # TODO: avoid recalculating
+        returned_row = rows[offset : offset + limit]
+        return QueryResult(query=query, num_rows=count, rows=returned_row)
 
     def query_facets(
         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
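
The offset/limit handling added here is plain list slicing: num_rows still reports the full match count while rows carries only the requested window, and a negative or absent limit means "return everything". A standalone sketch of the same logic, with made-up data:

rows = [{"id": i} for i in range(10)]   # pretend these matched the where clause
count = len(rows)

limit, offset = 3, 4                    # as passed via the query or keyword arguments
if limit is None or limit < 0:
    limit = count
page = rows[offset : offset + limit]

print(count, page)                      # 10 [{'id': 4}, {'id': 5}, {'id': 6}]
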
@@ -9,6 +9,7 @@ from linkml_runtime import SchemaView
 from linkml_store.api import Database
 from linkml_store.api.config import DatabaseConfig
 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.utils.file_utils import safe_remove_directory
 from linkml_store.utils.format_utils import Format, load_objects
 
 logger = logging.getLogger(__name__)
@@ -20,6 +21,8 @@ class FileSystemDatabase(Database):
     directory_path: Optional[Path] = None
     default_file_format: Optional[str] = None
 
+    no_backup_on_drop: bool = False
+
     def __init__(self, handle: Optional[str] = None, **kwargs):
         handle = handle.replace("file:", "")
         if handle.startswith("//"):
@@ -43,6 +46,12 @@ class FileSystemDatabase(Database):
     def close(self, **kwargs):
         pass
 
+    def drop(self, no_backup=False, **kwargs):
+        self.close()
+        path = self.directory_path
+        if path.exists():
+            safe_remove_directory(path, no_backup=self.no_backup_on_drop or no_backup)
+
     def init_collections(self):
         metadata = self.metadata
         if self._collections is None:
@@ -63,7 +72,7 @@ class FileSystemDatabase(Database):
             self._collections[n] = collection
             collection._set_objects(objs)
 
-    def induce_schema_view(self) -> SchemaView:
+    def xxxinduce_schema_view(self) -> SchemaView:
         logger.info(f"Inducing schema view for {self.handle}")
         sb = SchemaBuilder()
 
@@ -23,11 +23,15 @@ class MongoDBCollection(Collection):
 
     @property
     def mongo_collection(self) -> MongoCollection:
-        if not self.name:
+        # collection_name = self.alias or self.name
+        collection_name = self.alias
+        if not collection_name:
             raise ValueError("Collection name not set")
-        collection_name = self.alias or self.name
         return self.parent.native_db[collection_name]
 
+    def _check_if_initialized(self) -> bool:
+        return self.alias in self.parent.native_db.list_collection_names()
+
     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
@@ -3,9 +3,6 @@
 import logging
 from typing import Optional
 
-from linkml_runtime import SchemaView
-from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
-from linkml_runtime.utils.schema_builder import SchemaBuilder
 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase
 
@@ -63,10 +60,9 @@ class MongoDBDatabase(Database):
         self._native_client.close()
 
     def drop(self, **kwargs):
-        self.native_client.drop_database(self.metadata.alias)
+        self.native_client.drop_database(self.native_db.name)
 
     def query(self, query: Query, **kwargs) -> QueryResult:
-        # TODO: DRY
         if query.from_table:
             collection = self.get_collection(query.from_table)
             return collection.query(query, **kwargs)
@@ -81,34 +77,3 @@ class MongoDBDatabase(Database):
         if collection_name not in self._collections:
             collection = MongoDBCollection(name=collection_name, parent=self)
             self._collections[collection_name] = collection
-
-    def induce_schema_view(self) -> SchemaView:
-        logger.info(f"Inducing schema view for {self.handle}")
-        sb = SchemaBuilder()
-        schema = sb.schema
-
-        for collection_name in self.native_db.list_collection_names():
-            sb.add_class(collection_name)
-            mongo_collection = self.native_db[collection_name]
-            sample_doc = mongo_collection.find_one()
-            if sample_doc:
-                for field, value in sample_doc.items():
-                    if field == "_id":
-                        continue
-                    sd = SlotDefinition(field)
-                    if isinstance(value, list):
-                        sd.multivalued = True
-                    if isinstance(value, dict):
-                        sd.inlined = True
-                    sb.schema.classes[collection_name].attributes[sd.name] = sd
-
-        sb.add_defaults()
-        for cls_name in schema.classes:
-            if cls_name in self.metadata.collections:
-                collection_metadata = self.metadata.collections[cls_name]
-                if collection_metadata.attributes:
-                    del schema.classes[cls_name]
-                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
-                    schema.classes[cls.name] = cls
-
-        return SchemaView(schema)
@@ -18,7 +18,7 @@ class SolrCollection(Collection):
     @property
     def _collection_base(self) -> str:
         if self.parent.use_cores:
-            base_url = f"{self.parent.base_url}/{self.name}"
+            base_url = f"{self.parent.base_url}/{self.alias}"
         else:
             base_url = self.parent.base_url
         return base_url
@@ -37,7 +37,7 @@ class SolrCollection(Collection):
         if not qfs:
             raise ValueError("No searchable slots configured for Solr collection")
         solr_query = self._build_solr_query(where, search_term=query, extra={"defType": index_name, "qf": qfs})
-        logger.info(f"Querying Solr collection {self.name} with query: {solr_query}")
+        logger.info(f"Querying Solr collection {self.alias} with query: {solr_query}")
 
         response = requests.get(f"{self._collection_base}/select", params=solr_query)
         response.raise_for_status()
@@ -50,7 +50,7 @@ class SolrCollection(Collection):
 
     def query(self, query: Query, **kwargs) -> QueryResult:
         solr_query = self._build_solr_query(query)
-        logger.info(f"Querying Solr collection {self.name} with query: {solr_query}")
+        logger.info(f"Querying Solr collection {self.alias} with query: {solr_query}")
 
         response = requests.get(f"{self._collection_base}/select", params=solr_query)
         response.raise_for_status()
@@ -69,7 +69,7 @@ class SolrCollection(Collection):
         solr_query["facet.field"] = facet_columns
         solr_query["facet.limit"] = facet_limit
 
-        logger.info(f"Querying Solr collection {self.name} for facets with query: {solr_query}")
+        logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")
 
         response = requests.get(f"{self._collection_base}/select", params=solr_query)
         response.raise_for_status()
linkml_store/cli.py CHANGED
@@ -16,6 +16,7 @@ from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
 from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
 from linkml_store.utils.object_utils import object_path_update
+from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted
 
 index_type_option = click.option(
     "--index-type",
@@ -87,6 +88,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
 @click.option("-q", "--quiet/--no-quiet")
+@click.option("--base-dir", "-B", help="Base directory for the client configuration")
 @click.option(
     "--stacktrace/--no-stacktrace",
     default=False,
@@ -94,7 +96,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -117,7 +119,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     if quiet:
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
-    client = Client().from_config(config) if config else Client()
+    client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
     # DEPRECATED
@@ -150,7 +152,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     # raise ValueError("Collection must be specified if there are multiple collections.")
     if settings.database and settings.database.list_collections():
         collection = settings.database.list_collections()[0]
-        settings.collection_name = collection.name
+        settings.collection_name = collection.alias
 
 
 @cli.command()
@@ -180,15 +182,15 @@ def insert(ctx, files, object, format):
             objects = load_objects(file_path, format=format)
         else:
             objects = load_objects(file_path)
-        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.name}'.")
+        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
         collection.insert(objects)
-        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.name}'.")
+        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
             collection.insert(objects)
-            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.name}'.")
+            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()
 
 
@@ -324,7 +326,7 @@ def query(ctx, where, limit, output_type, output):
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
-    query = Query(from_table=collection.name, where_clause=where_clause, limit=limit)
+    query = Query(from_table=collection.alias, where_clause=where_clause, limit=limit)
     result = collection.query(query)
     output_data = render_output(result.rows, output_type)
     if output:
@@ -341,7 +343,7 @@ def query(ctx, where, limit, output_type, output):
 def list_collections(ctx, **kwargs):
     db = ctx.obj["settings"].database
     for collection in db.list_collections(**kwargs):
-        click.echo(collection.name)
+        click.echo(collection.alias)
         click.echo(render_output(collection.metadata))
 
 
@@ -351,8 +353,9 @@ def list_collections(ctx, **kwargs):
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
 @click.option("--columns", "-S", help="Columns to facet on")
+@click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
 @click.pass_context
-def fq(ctx, where, limit, columns, output_type, output):
+def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.
 
@@ -379,11 +382,22 @@ def fq(ctx, where, limit, columns, output_type, output):
             return "+".join([str(x) for x in key])
         return key
 
-    count_dict = {}
-    for key, value in results.items():
-        value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
-        count_dict[_untuple(key)] = value_as_dict
-    output_data = render_output(count_dict, output_type)
+    if wide:
+        results_obj = facet_summary_to_dataframe_unmelted(results)
+    else:
+        if output_type == Format.PYTHON.value:
+            results_obj = results
+        elif output_type in [Format.TSV.value, Format.CSV.value]:
+            results_obj = []
+            for fc, data in results.items():
+                for v, c in data:
+                    results_obj.append({"facet": fc, "value": v, "count": c})
+        else:
+            results_obj = {}
+            for key, value in results.items():
+                value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
+                results_obj[_untuple(key)] = value_as_dict
+    output_data = render_output(results_obj, output_type)
     if output:
         with open(output, "w") as f:
            f.write(output_data)
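
For tabular output types the facet results are now flattened into one row per (facet, value, count) triple rather than a nested dict. A small sketch of that reshaping, using made-up facet data:

results = {"status": [("open", 7), ("closed", 3)]}  # hypothetical facet results

rows = []
for fc, data in results.items():
    for v, c in data:
        rows.append({"facet": fc, "value": v, "count": c})

# rows == [{'facet': 'status', 'value': 'open', 'count': 7},
#          {'facet': 'status', 'value': 'closed', 'count': 3}]
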
@@ -403,14 +417,17 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
 @click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--limit", "-l", default=-1, show_default=True, type=click.INT, help="Maximum number of results to return"
+)
 @click.pass_context
-def describe(ctx, where, output_type, output):
+def describe(ctx, where, output_type, output, limit):
     """
     Describe the collection schema.
     """
     where_clause = yaml.safe_load(where) if where else None
     collection = ctx.obj["settings"].collection
-    df = collection.find(where_clause, limit=1).rows_dataframe
+    df = collection.find(where_clause, limit=limit).rows_dataframe
     write_output(df.describe(include="all").transpose(), output_type, target=output)
 
 
@@ -468,7 +485,7 @@ def search(ctx, search_term, where, limit, index_type, output_type, output, auto
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
-    logger.info(f"Attaching index to collection {collection.name}: {ix.model_dump()}")
+    logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
     result = collection.search(search_term, where=where, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
@@ -498,6 +515,7 @@ def indexes(ctx):
 def validate(ctx, output_type, output):
     """Validate objects in the specified collection."""
     collection = ctx.obj["settings"].collection
+    logger.info(f"Validating collection {collection.alias}")
     validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection()]
     output_data = render_output(validation_results, output_type)
     if output:
@@ -1,3 +1,14 @@
+"""
+Indexers package.
+
+Indexers allow indexes to be added to existing :class:`Collection` objects.
+
+Current two are supported:
+
+* simple: :class:`SimpleIndexer`
+* llm: :class:`LLMIndexer`
+"""
+
 from typing import Type
 
 from linkml_store.index.implementations.llm_indexer import LLMIndexer
@@ -14,7 +25,7 @@ def get_indexer_class(name: str) -> Type[Indexer]:
     """
     Get an indexer class by name.
 
-    :param name: the name of the indexer
+    :param name: the name of the indexer (simple, llm, ...)
     :return: the indexer class
     """
     if name not in INDEXER_CLASSES:
@@ -26,7 +37,10 @@ def get_indexer(index_type: str, **kwargs) -> Indexer:
     """
     Get an indexer by name.
 
-    :param name: the name of the indexer
+    >>> simple_indexer = get_indexer("simple")
+    >>> llm_indexer = get_indexer("llm")
+
+    :param name: the name of the indexer (simple, llm, ...)
     :param kwargs: additional arguments to pass to the indexer
     :return: the indexer
     """
@@ -74,7 +74,7 @@ class LLMIndexer(Indexer):
 
         embeddings_client = Client()
         config = CollectionConfig(
-            name=coll_name,
+            alias=coll_name,
             type="Embeddings",
             attributes={
                 "text": {"range": "string"},
@@ -116,6 +116,7 @@ class LLMIndexer(Indexer):
                 embeddings_collection.insert(
                     {"text": uncached_texts[i], "embedding": embeddings[index], "model_id": model_id}
                 )
+            embeddings_collection.commit()
         else:
             logger.info(f"Embedding {len(texts)} texts")
             embeddings = model.embed_multi(texts)
@@ -11,11 +11,22 @@ logger = logging.getLogger(__name__)
 
 
 class TemplateSyntaxEnum(str, Enum):
+    """
+    Template syntax types.
+    """
+
     jinja2 = "jinja2"
     fstring = "fstring"
 
 
-def cosine_similarity(vector1, vector2):
+def cosine_similarity(vector1, vector2) -> float:
+    """
+    Calculate the cosine similarity between two vectors
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
     dot_product = np.dot(vector1, vector2)
     norm1 = np.linalg.norm(vector1)
     norm2 = np.linalg.norm(vector2)
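
For reference, cosine similarity is the dot product of the two vectors divided by the product of their norms. A quick worked check of the computation in the function body above, with illustrative vectors:

import numpy as np

a = np.array([1.0, 0.0])
b = np.array([1.0, 1.0])

sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(round(sim, 4))  # 0.7071 -- cos(45 degrees); identical vectors give 1.0, orthogonal ones 0.0
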
@@ -24,7 +35,7 @@ def cosine_similarity(vector1, vector2):
 
 class Indexer(BaseModel):
     """
-    An index operates on a collection in order to search for objects.
+    An indexer operates on a collection in order to search for objects.
     """
 
     name: Optional[str] = None
@@ -0,0 +1,37 @@
+import logging
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+
+def safe_remove_directory(dir_path: Path, no_backup: bool = False) -> Optional[Path]:
+    # Ensure the directory exists
+    if not dir_path.exists():
+        raise FileNotFoundError(f"Directory does not exist: {dir_path}")
+    try:
+        if no_backup:
+            # Move to a temporary directory instead of permanent removal
+            with tempfile.TemporaryDirectory() as tmpdir:
+                tmp_path = Path(tmpdir) / dir_path.name
+                shutil.move(str(dir_path), str(tmp_path))
+                logger.info(f"Directory moved to temporary location: {tmp_path}")
+                # The directory will be automatically removed when exiting the context manager
+            return None
+        else:
+            # Create a backup directory name with timestamp
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            backup_dir = dir_path.with_name(f"{dir_path.name}_backup_{timestamp}")
+
+            # Move the directory to the backup location
+            shutil.move(str(dir_path), str(backup_dir))
+            logger.info(f"Directory backed up to: {backup_dir}")
+            return backup_dir
+
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        return None
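
By default this helper renames the directory to a timestamped backup next to the original; with no_backup=True it is moved into a temporary directory that is discarded when the call returns. A usage sketch against a throwaway directory (the directory name is purely illustrative):

from pathlib import Path
from linkml_store.utils.file_utils import safe_remove_directory

scratch = Path("scratch_dir")  # hypothetical directory used only for this example
scratch.mkdir(exist_ok=True)

backup = safe_remove_directory(scratch)          # moved to scratch_dir_backup_<timestamp>
print(backup)

scratch.mkdir(exist_ok=True)
safe_remove_directory(scratch, no_backup=True)   # moved into a TemporaryDirectory and dropped
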
@@ -7,8 +7,10 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, TextIO, Type, Union
 
 import pandas as pd
+import pystow
 import yaml
 from pydantic import BaseModel
+from tabulate import tabulate
 
 
 class Format(Enum):
@@ -21,12 +23,40 @@ class Format(Enum):
     YAML = "yaml"
     TSV = "tsv"
     CSV = "csv"
+    PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
+    TABLE = "table"
+
+
+def load_objects_from_url(
+    url: str,
+    format: Union[Format, str] = None,
+    expected_type: Type = None,
+    local_path: Optional[str] = None,
+    **kwargs,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a URL in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    :param url: The URL to the file.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param expected_type: The target type to load the objects into.
+    :param local_path: The local path to save the file to.
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    local_path = pystow.ensure("linkml", "linkml-store", url=url)
+    objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
+    if not objs:
+        raise ValueError(f"No objects loaded from URL: {url}")
+    return objs
 
 
 def load_objects(
-    file_path: Union[str, Path], format: Union[Format, str] = None, expected_type: Type = None
+    file_path: Union[str, Path],
+    format: Union[Format, str] = None,
+    expected_type: Type = None,
+    header_comment_token: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """
     Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -37,7 +67,7 @@ def load_objects(
 
     :param file_path: The path to the file.
     :param format: The format of the file. Can be a Format enum or a string value.
-    :param expected_type: The target type to load the objects into.
+    :param expected_type: The target type to load the objects into, e.g. list
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(format, str):
@@ -48,6 +78,12 @@ def load_objects(
 
     if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
         format = Format.PARQUET
+    if not format and file_path.endswith(".tsv"):
+        format = Format.TSV
+    if not format and file_path.endswith(".csv"):
+        format = Format.CSV
+    if not format and file_path.endswith(".py"):
+        format = Format.PYTHON
 
     mode = "r"
     if format == Format.PARQUET:
@@ -68,11 +104,29 @@ def load_objects(
                 objs = list(yaml.safe_load_all(f))
             else:
                 objs = yaml.safe_load(f)
-        elif format == Format.TSV or (not format and file_path.endswith(".tsv")):
-            reader = csv.DictReader(f, delimiter="\t")
-            objs = list(reader)
-        elif format == Format.CSV or (not format and file_path.endswith(".csv")):
-            reader = csv.DictReader(f)
+        elif format == Format.TSV or format == Format.CSV:
+            # Skip initial comment lines if comment_char is set
+            if header_comment_token:
+                # Store the original position
+                original_pos = f.tell()
+
+                # Read and store lines until we find a non-comment line
+                lines = []
+                for line in f:
+                    if not line.startswith(header_comment_token):
+                        break
+                    lines.append(line)
+
+                # Go back to the original position
+                f.seek(original_pos)
+
+                # Skip the comment lines we found
+                for _ in lines:
+                    f.readline()
+            if format == Format.TSV:
+                reader = csv.DictReader(f, delimiter="\t")
+            else:
+                reader = csv.DictReader(f)
             objs = list(reader)
         elif format == Format.PARQUET:
             import pyarrow.parquet as pq
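
With the new header_comment_token parameter, leading comment lines are consumed before the CSV/TSV reader sees the header row. A small sketch of calling it (file name and contents are illustrative):

from linkml_store.utils.format_utils import load_objects

with open("people.tsv", "w") as f:          # hypothetical input file
    f.write("# generated 2024-01-01\n")
    f.write("# source: example\n")
    f.write("name\tage\n")
    f.write("Alice\t30\n")

objs = load_objects("people.tsv", format="tsv", header_comment_token="#")
# objs == [{'name': 'Alice', 'age': '30'}]
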
@@ -151,6 +205,9 @@ def render_output(
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")
 
+    if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
+        data = [data]
+
     if isinstance(data, BaseModel):
         data = data.model_dump()
 
@@ -158,6 +215,10 @@ def render_output(
         return json.dumps(data, indent=2, default=str)
     elif format == Format.JSONL:
         return "\n".join(json.dumps(obj) for obj in data)
+    elif format == Format.PYTHON:
+        return str(data)
+    elif format == Format.TABLE:
+        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
             return yaml.safe_dump_all(data, sort_keys=False)
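
The two new output formats are thin wrappers: python is simply str() of the data, and table feeds the rows through a DataFrame into tabulate. A minimal sketch of the table branch, with illustrative rows:

import pandas as pd
from tabulate import tabulate

data = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]
print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql"))
# prints a psql-style grid with one row per object and a column per key
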