linkml-store 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of linkml-store has been flagged as potentially problematic.

@@ -1,10 +1,14 @@
  import csv
+ import gzip
+ import io
  import json
+ import logging
  import sys
+ import tarfile
  from enum import Enum
  from io import StringIO
  from pathlib import Path
- from typing import Any, Dict, List, Optional, TextIO, Type, Union
+ from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union

  import pandas as pd
  import pystow
@@ -12,6 +16,8 @@ import yaml
  from pydantic import BaseModel
  from tabulate import tabulate

+ logger = logging.getLogger(__name__)
+

  class Format(Enum):
      """
@@ -27,6 +33,35 @@ class Format(Enum):
      PARQUET = "parquet"
      FORMATTED = "formatted"
      TABLE = "table"
+     SQLDUMP_DUCKDB = "duckdb"
+     SQLDUMP_POSTGRES = "postgres"
+     DUMP_MONGODB = "mongodb"
+
+     @classmethod
+     def guess_format(cls, file_name: str) -> Optional["Format"]:
+         ext = Path(file_name).suffix.lower()
+
+         format_map = {
+             ".json": cls.JSON,
+             ".jsonl": cls.JSONL,
+             ".yaml": cls.YAML,
+             ".yml": cls.YAML,
+             ".tsv": cls.TSV,
+             ".csv": cls.CSV,
+             ".py": cls.PYTHON,
+             ".parquet": cls.PARQUET,
+             ".pq": cls.PARQUET,
+         }
+         fmt = format_map.get(ext, None)
+         if fmt is None:
+             if ext.startswith("."):
+                 ext = ext[1:]
+             if ext in [f.value for f in Format]:
+                 return Format(ext)
+         return fmt
+
+     def is_dump_format(self):
+         return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]


  def load_objects_from_url(
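The new guess_format classmethod consults an explicit extension map first and then falls back to matching the bare extension against Format enum values, which is how dump formats such as "duckdb" resolve. A minimal sketch of the resulting behavior (illustrative only; the import path is assumed):

    from linkml_store.utils.format_utils import Format  # import path assumed

    Format.guess_format("data.yml")       # Format.YAML, via the extension map
    Format.guess_format("dump.duckdb")    # Format.SQLDUMP_DUCKDB, via the enum-value fallback
    Format.guess_format("data.unknown")   # None
    Format.DUMP_MONGODB.is_dump_format()  # True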
@@ -46,15 +81,109 @@ def load_objects_from_url(
      :return: A list of dictionaries representing the loaded objects.
      """
      local_path = pystow.ensure("linkml", "linkml-store", url=url)
+     logger.info(f"synced to {local_path}")
      objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
      if not objs:
          raise ValueError(f"No objects loaded from URL: {url}")
      return objs


+ def process_file(
+     f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
+ ) -> List[Dict[str, Any]]:
+     """
+     Process a single file and return a list of objects.
+     """
+     if format == Format.JSON:
+         objs = json.load(f)
+     elif format == Format.JSONL:
+         objs = [json.loads(line) for line in f]
+     elif format == Format.YAML:
+         if expected_type and expected_type == list:  # noqa E721
+             objs = list(yaml.safe_load_all(f))
+         else:
+             objs = yaml.safe_load(f)
+     elif format in [Format.TSV, Format.CSV]:
+         if header_comment_token:
+             while True:
+                 pos = f.tell()
+                 line = f.readline()
+                 if not line.startswith(header_comment_token):
+                     f.seek(pos)
+                     break
+         delimiter = "\t" if format == Format.TSV else ","
+         reader = csv.DictReader(f, delimiter=delimiter)
+         objs = list(reader)
+     elif format == Format.PARQUET:
+         import pyarrow.parquet as pq
+
+         table = pq.read_table(f)
+         objs = table.to_pandas().to_dict(orient="records")
+     elif format in [Format.PYTHON, Format.FORMATTED, Format.TABLE]:
+         raise ValueError(f"Format {format} is not supported for loading objects")
+     else:
+         raise ValueError(f"Unsupported file format: {format}")
+
+     if not isinstance(objs, list):
+         objs = [objs]
+     return objs
+
+
  def load_objects(
+     file_path: Union[str, Path],
+     format: Optional[Union[Format, str]] = None,
+     compression: Optional[str] = None,
+     expected_type: Optional[Type] = None,
+     header_comment_token: Optional[str] = None,
+ ) -> List[Dict[str, Any]]:
+     """
+     Load objects from a file or archive in supported formats.
+     For tgz archives, it processes all files and concatenates the results.
+
+     :param file_path: The path to the file or archive.
+     :param format: The format of the file. Can be a Format enum or a string value.
+     :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
+     :param expected_type: The target type to load the objects into, e.g. list
+     :param header_comment_token: Token used for header comments to be skipped
+     :return: A list of dictionaries representing the loaded objects.
+     """
+     if isinstance(file_path, Path):
+         file_path = str(file_path)
+
+     if isinstance(format, str):
+         format = Format(format)
+
+     all_objects = []
+
+     if compression == "tgz":
+         with tarfile.open(file_path, "r:gz") as tar:
+             for member in tar.getmembers():
+                 if member.isfile():
+                     f = tar.extractfile(member)
+                     if f:
+                         content = io.TextIOWrapper(f)
+                         member_format = Format.guess_format(member.name) if not format else format
+                         logger.debug(f"Processing tar member {member.name} with format {member_format}")
+                         all_objects.extend(process_file(content, member_format, expected_type, header_comment_token))
+     else:
+         if Path(file_path).is_dir():
+             raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
+         mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
+         open_func = gzip.open if compression == "gz" else open
+         format = Format.guess_format(file_path) if not format else format
+         with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
+             if compression == "gz" and mode == "r":
+                 f = io.TextIOWrapper(f)
+             all_objects = process_file(f, format, expected_type, header_comment_token)

+     logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+     return all_objects
+
+
+ def xxxload_objects(
      file_path: Union[str, Path],
      format: Union[Format, str] = None,
+     compression: Optional[str] = None,
      expected_type: Type = None,
      header_comment_token: Optional[str] = None,
  ) -> List[Dict[str, Any]]:
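The rewritten load_objects takes an explicit compression argument: "gz" opens a single gzip-wrapped file, while "tgz" walks every member of a tar.gz archive and concatenates the results, guessing each member's format when none is supplied. A hedged sketch of the call patterns (file names are hypothetical):

    objs = load_objects("data.csv")                        # format guessed from the extension
    objs = load_objects("data.json.gz", compression="gz")  # gzip-wrapped single file
    objs = load_objects("bundle.tgz", compression="tgz")   # every tar member loaded, results concatenated
    objs = load_objects("-", format="jsonl")               # stdin; format cannot be guessed from "-"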
@@ -172,7 +301,7 @@ def write_output(


  def render_output(
-     data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Union[Format, str] = Format.YAML
+     data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Optional[Union[Format, str]] = Format.YAML
  ) -> str:
      """
      Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -271,15 +400,4 @@ def guess_format(path: str) -> Optional[Format]:
      :param path: The path to the file.
      :return: The guessed format.
      """
-     if path.endswith(".json"):
-         return Format.JSON
-     elif path.endswith(".jsonl"):
-         return Format.JSONL
-     elif path.endswith(".yaml") or path.endswith(".yml"):
-         return Format.YAML
-     elif path.endswith(".tsv"):
-         return Format.TSV
-     elif path.endswith(".csv"):
-         return Format.CSV
-     else:
-         return None
+     return Format.guess_format(path)
@@ -0,0 +1,145 @@
+ import logging
+ import os
+ import subprocess
+ from pathlib import Path
+ from typing import Optional
+ from urllib.parse import urlparse
+
+ from pymongo import MongoClient
+ from pymongo.database import Database
+
+ logger = logging.getLogger(__name__)
+
+
+ def extract_connection_info(db: Database):
+     client = db.client
+
+     # Get the host and port
+     host_info = client.address
+     if host_info:
+         host, port = host_info
+     else:
+         # For replica sets or sharded clusters, we might need to get this differently
+         host = client.HOST
+         port = client.PORT
+
+     # Get the database name
+     db_name = db.name
+
+     # Get username if available
+     username = None
+     if hasattr(client, "options") and hasattr(client.options, "credentials"):
+         credentials = client.options.credentials
+         if credentials:
+             username = credentials.username
+
+     return {"host": host, "port": port, "db_name": db_name, "username": username}
+
+
+ def get_connection_string(client: MongoClient):
+     """
+     Extract a connection string from the MongoClient.
+     This avoids triggering truth value testing on Database objects.
+     """
+     if client.address:
+         host, port = client.address
+         return f"{host}:{port}"
+     if hasattr(client, "address") and client.address:
+         host, port = client.address
+         return f"{host}:{port}"
+     elif client.hosts:
+         # For replica sets, return all hosts
+         return ",".join(f"{host}:{port}" for host, port in client.hosts)
+     elif hasattr(client, "HOST"):
+         # If we can't determine hosts, use the entire URI
+         parsed_uri = urlparse(client.HOST)
+         return f"{parsed_uri.hostname}:{parsed_uri.port}"
+     else:
+         raise ValueError("Unable to determine connection string from client")
+
+
+ def get_connection_info(db: Database):
+     """
+     Extract connection information from the Database object.
+     """
+     # Get the name of the database
+     db_name = db.name
+
+     # Get the client's node list (this should work for single nodes and replica sets)
+     node_list = db.client.nodes
+
+     if not node_list:
+         raise ValueError("Unable to determine connection information from database")
+
+     # Use the first node in the list (for single node setups, this will be the only node)
+     first_node = node_list[0]
+     host, port = first_node
+
+     return host, port, db_name
+
+
+ def get_auth_from_client(client: MongoClient):
+     """Extract authentication details from MongoClient."""
+     if hasattr(client, "_MongoClient__options"):
+         # For older versions of PyMongo
+         options = client._MongoClient__options
+     elif hasattr(client, "options"):
+         # For newer versions of PyMongo
+         options = client.options
+     else:
+         return None, None, None
+
+     if hasattr(options, "credentials"):
+         creds = options.credentials
+         return creds.username, creds.password, creds.source
+     return None, None, None
+
+
+ def connection_from_handle(handle: str):
+     if handle.startswith("mongodb://"):
+         handle = handle.replace("mongodb://", "")
+     host, db = handle.split("/")
+     return host, db
+
+
+ def export_mongodb(handle: str, location: str, password: Optional[str] = None):
+     host, db_name = connection_from_handle(handle)
+
+     # Construct the mongodump command
+     cmd = ["mongodump", f"--host={host}", f"--db={db_name}"]
+     logger.info(f"Exporting MongoDB database {db_name} from {host} to {location}")
+     cmd.extend(["--out", location])
+     result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+     logger.info(f"MongoDB export completed successfully. Output: {result.stdout}")
+
+
+ def import_mongodb(handle: str, dump_dir: str, drop: bool = False):
+     host, db_name = connection_from_handle(handle)
+
+     # list dirs in dump_dir
+     dir_path = Path(dump_dir)
+     if not dir_path.is_dir():
+         raise ValueError(f"{dir_path} is not a dir")
+     directories = [name for name in os.listdir(dump_dir)]
+     if len(directories) != 1:
+         raise ValueError(f"Expected exactly one database in {dump_dir}, got: {directories}")
+     src_db_name = directories[0]
+
+     # Construct the mongorestore command
+     cmd = [
+         "mongorestore",
+         f"--host={host}",
+         f"--nsFrom={src_db_name}.*",
+         f"--nsTo={db_name}.*",
+         str(dump_dir),
+     ]
+
+     # Add drop option if specified
+     if drop:
+         cmd.append("--drop")
+     logger.info(f"CMD={cmd}")
+     # Execute mongorestore
+     result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+     if result.stderr:
+         logger.warning(result.stderr)
+     logger.info(f"MongoDB import completed successfully. Output: {result.stdout} // {result.stderr}")
@@ -0,0 +1,42 @@
+ import networkx as nx
+ from py2neo import Graph
+
+
+ def draw_neo4j_graph(handle="bolt://localhost:7687", auth=("neo4j", None)):
+     # Connect to Neo4j
+     graph = Graph(handle, auth=auth)
+
+     # Run a Cypher query
+     query = """
+     MATCH (n)-[r]->(m)
+     RETURN n, r, m
+     LIMIT 100
+     """
+     result = graph.run(query)
+
+     # Create a NetworkX graph
+     G = nx.DiGraph()  # Use DiGraph for directed edges
+     for record in result:
+         n = record["n"]
+         m = record["m"]
+         r = record["r"]
+         G.add_node(n["name"], label=list(n.labels or ["-"])[0])
+         G.add_node(m["name"], label=list(m.labels or ["-"])[0])
+         G.add_edge(n["name"], m["name"], type=type(r).__name__)
+
+     # Draw the graph
+     pos = nx.spring_layout(G)
+
+     # Draw nodes
+     nx.draw_networkx_nodes(G, pos, node_color="lightblue", node_size=10000)
+
+     # Draw edges
+     nx.draw_networkx_edges(G, pos, edge_color="gray", arrows=True)
+
+     # Add node labels
+     node_labels = nx.get_node_attributes(G, "label")
+     nx.draw_networkx_labels(G, pos, {node: f"{node}\n({label})" for node, label in node_labels.items()}, font_size=16)
+
+     # Add edge labels
+     edge_labels = nx.get_edge_attributes(G, "type")
+     nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=16)
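draw_neo4j_graph only issues NetworkX drawing calls; it never renders or saves a figure, so the caller is expected to drive matplotlib. A usage sketch (credentials are placeholders):

    import matplotlib.pyplot as plt

    draw_neo4j_graph("bolt://localhost:7687", auth=("neo4j", "secret"))
    plt.show()  # the nx.draw_* calls above draw onto the current matplotlib axes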
@@ -66,9 +66,14 @@ def col_val_constraints_to_conjs(col_name: str, val_constraints: Any) -> list:
          conjs = []
          for k, v in val_constraints.items():
              if k in OP_MAP:
-                 conjs.append(f"{OP_MAP[k]}({col_name}, {_quote(v)})")
+                 if k == "$in" and isinstance(v, list):
+                     v_mapped = [_quote(v1) for v1 in v]
+                     t = f"{col_name} IN ({', '.join(v_mapped)})"
+                 else:
+                     t = f"{OP_MAP[k]}({col_name}, {_quote(v)})"
              else:
-                 conjs.append(f"{col_name} {k} {_quote(v)}")
+                 t = f"{col_name} {k} {_quote(v)}"
+             conjs.append(t)
          return conjs
      else:
          return [f"{col_name} = {_quote(val_constraints)}"]
@@ -1,5 +1,13 @@
  {% extends "base.html.j2" %}

+
+ {% macro make_link(link) %}
+ {{ link.rel }} [
+ page: <a href="/pages{{ link.href }}">/pages{{ link.href }}</a> |
+ API: <a href="{{ link.href }}">{{ link.href }}</a> ]
+ <a href="{{ href }}">{{ rel }}</a>
+ {% endmacro %}
+
  {% block title %}{meta.path}{% endblock %}

  {% block content %}
@@ -9,38 +17,27 @@
  </pre>

  <h1>Links</h1>
- <ul>
- {% for link in response.links %}
- <li>
- <a href="/pages{{ link.href }}">{{ link.rel }} ({{ link.href }})</a>
- </li>
- {% endfor %}
+ <ul>
+ {% for link in response.links %}
+ <li> {{ make_link(link) }} </li>
+ {% endfor %}
  </ul>
- </ul>

+ {% if response.items != None and response["items"] != None %}
+ <h1>Items</h1>
+ <ul>
+ {% for item in response["items"] %}
+ <li>
+ {{ item.name }}
+ {% for link in item.links %}
+ {{ make_link(link) }}
+ {% endfor %}
+ HTML: {{ item.html | safe }}
+ </li>
+ {% endfor %}
+ {% endif %}
  <h1>Data</h1>
- {% if data_html %}
- <ul>
- {% for e in data_html %}
- <li>{{ e|safe }}</li>
- {% endfor %}
- </ul>
- {% else %}
-
- {% if "items" in response.data %}
- <ul>
- {% for item in response.data['items'] %}
- <li>
- {{ item.name }}
- {% for link in item.links %}
- <a href="/pages{{ link.href }}">{{ link.rel }}</a>
- {% endfor %}
- </li>
- {% endfor %}
- </ul>
- {% endif %}
  <pre>
  {{ response.data }}
  </pre>
- {% endif %}
  {% endblock %}