graflo 1.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of graflo might be problematic; see the registry's advisory page for more details.

Files changed (70)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1276 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +418 -0
  7. graflo/architecture/onto.py +376 -0
  8. graflo/architecture/onto_sql.py +54 -0
  9. graflo/architecture/resource.py +163 -0
  10. graflo/architecture/schema.py +135 -0
  11. graflo/architecture/transform.py +292 -0
  12. graflo/architecture/util.py +89 -0
  13. graflo/architecture/vertex.py +562 -0
  14. graflo/caster.py +736 -0
  15. graflo/cli/__init__.py +14 -0
  16. graflo/cli/ingest.py +203 -0
  17. graflo/cli/manage_dbs.py +197 -0
  18. graflo/cli/plot_schema.py +132 -0
  19. graflo/cli/xml2json.py +93 -0
  20. graflo/data_source/__init__.py +48 -0
  21. graflo/data_source/api.py +339 -0
  22. graflo/data_source/base.py +95 -0
  23. graflo/data_source/factory.py +304 -0
  24. graflo/data_source/file.py +148 -0
  25. graflo/data_source/memory.py +70 -0
  26. graflo/data_source/registry.py +82 -0
  27. graflo/data_source/sql.py +183 -0
  28. graflo/db/__init__.py +44 -0
  29. graflo/db/arango/__init__.py +22 -0
  30. graflo/db/arango/conn.py +1025 -0
  31. graflo/db/arango/query.py +180 -0
  32. graflo/db/arango/util.py +88 -0
  33. graflo/db/conn.py +377 -0
  34. graflo/db/connection/__init__.py +6 -0
  35. graflo/db/connection/config_mapping.py +18 -0
  36. graflo/db/connection/onto.py +717 -0
  37. graflo/db/connection/wsgi.py +29 -0
  38. graflo/db/manager.py +119 -0
  39. graflo/db/neo4j/__init__.py +16 -0
  40. graflo/db/neo4j/conn.py +639 -0
  41. graflo/db/postgres/__init__.py +37 -0
  42. graflo/db/postgres/conn.py +948 -0
  43. graflo/db/postgres/fuzzy_matcher.py +281 -0
  44. graflo/db/postgres/heuristics.py +133 -0
  45. graflo/db/postgres/inference_utils.py +428 -0
  46. graflo/db/postgres/resource_mapping.py +273 -0
  47. graflo/db/postgres/schema_inference.py +372 -0
  48. graflo/db/postgres/types.py +148 -0
  49. graflo/db/postgres/util.py +87 -0
  50. graflo/db/tigergraph/__init__.py +9 -0
  51. graflo/db/tigergraph/conn.py +2365 -0
  52. graflo/db/tigergraph/onto.py +26 -0
  53. graflo/db/util.py +49 -0
  54. graflo/filter/__init__.py +21 -0
  55. graflo/filter/onto.py +525 -0
  56. graflo/logging.conf +22 -0
  57. graflo/onto.py +312 -0
  58. graflo/plot/__init__.py +17 -0
  59. graflo/plot/plotter.py +616 -0
  60. graflo/util/__init__.py +23 -0
  61. graflo/util/chunker.py +807 -0
  62. graflo/util/merge.py +150 -0
  63. graflo/util/misc.py +37 -0
  64. graflo/util/onto.py +422 -0
  65. graflo/util/transform.py +454 -0
  66. graflo-1.3.7.dist-info/METADATA +243 -0
  67. graflo-1.3.7.dist-info/RECORD +70 -0
  68. graflo-1.3.7.dist-info/WHEEL +4 -0
  69. graflo-1.3.7.dist-info/entry_points.txt +5 -0
  70. graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
graflo/cli/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """Command-line interface for graflo.
2
+
3
+ This module provides command-line tools for working with graflo, including
4
+ utilities for data ingestion, schema management, and graph operations.
5
+
6
+ Key Components:
7
+ - Command-line tools for data processing
8
+ - Schema management utilities
9
+ - Graph database operations
10
+
11
+ Example:
12
+ >>> uv run ingest --config config.json --data data.json
13
+ >>> uv run plot_schema --config schema.yaml --output figs
14
+ """
graflo/cli/ingest.py ADDED
@@ -0,0 +1,203 @@
1
+ """Data ingestion command-line interface for graph databases.
2
+
3
+ This module provides a CLI tool for ingesting data into graph databases. It supports
4
+ batch processing, parallel execution, and various data formats. The tool can handle
5
+ both initial database setup and incremental data ingestion.
6
+
7
+ Key Features:
8
+ - Configurable batch processing
9
+ - Multi-core and multi-threaded execution
10
+ - Support for custom resource patterns
11
+ - Database initialization and cleanup options
12
+ - Flexible file discovery and processing
13
+
14
+ Example:
15
+ $ uv run ingest \\
16
+ --db-config-path config/db.yaml \\
17
+ --schema-path config/schema.yaml \\
18
+ --source-path data/ \\
19
+ --batch-size 5000 \\
20
+ --n-cores 4
21
+ """
22
+
23
+ import logging.config
24
+ import pathlib
25
+ from os.path import dirname, join, realpath
26
+
27
+ import click
28
+ from suthing import FileHandle
29
+
30
+ from graflo import Caster, DataSourceRegistry, Patterns, Schema
31
+ from graflo.db.connection.onto import DBConfig
32
+ from graflo.data_source import DataSourceFactory
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
@click.command()
@click.option(
    "--db-config-path",
    type=click.Path(exists=True, path_type=pathlib.Path),
    required=True,
)
@click.option(
    "--schema-path",
    type=click.Path(exists=True, path_type=pathlib.Path),
    required=True,
)
@click.option(
    "--source-path",
    type=click.Path(exists=True, path_type=pathlib.Path),
    required=False,
    help="Path to source data directory (required if not using --data-source-config-path)",
)
@click.option(
    "--resource-pattern-config-path",
    type=click.Path(exists=True, path_type=pathlib.Path),
    default=None,
)
@click.option(
    "--data-source-config-path",
    type=click.Path(exists=True, path_type=pathlib.Path),
    default=None,
    help="Path to data source configuration file (supports API, SQL, file sources)",
)
@click.option("--limit-files", type=int, default=None)
@click.option("--batch-size", type=int, default=5000)
@click.option("--n-cores", type=int, default=1)
@click.option(
    "--n-threads",
    type=int,
    default=1,
)
# default=False so the value is a real bool rather than None when omitted
@click.option("--fresh-start", type=bool, default=False, help="wipe existing database")
@click.option(
    "--init-only",
    default=False,
    is_flag=True,
    help="skip ingestion; only init the db",
)
def ingest(
    db_config_path,
    schema_path,
    source_path,
    limit_files,
    batch_size,
    n_cores,
    n_threads,
    fresh_start,
    init_only,
    resource_pattern_config_path,
    data_source_config_path,
):
    """Ingest data into a graph database.

    This command processes data files and ingests them into a graph database according
    to the provided schema. It supports various configuration options for controlling
    the ingestion process.

    Args:
        db_config_path: Path to database configuration file
        schema_path: Path to schema configuration file
        source_path: Path to source data directory
        limit_files: Optional limit on number of files to process
        batch_size: Number of items to process in each batch (default: 5000)
        n_cores: Number of CPU cores to use for parallel processing (default: 1)
        n_threads: Number of threads per core (default: 1). Accepted so click can
            pass the declared ``--n-threads`` option; previously the option was
            declared but missing from this signature, which made every invocation
            fail with ``TypeError: unexpected keyword argument 'n_threads'``.
        fresh_start: Whether to wipe existing database before ingestion
        init_only: Whether to only initialize the database without ingestion
        resource_pattern_config_path: Optional path to resource pattern configuration
        data_source_config_path: Optional path to a data source configuration
            (API, SQL, file sources); mutually complementary with source_path

    Example:
        $ uv run ingest \\
            --db-config-path config/db.yaml \\
            --schema-path config/schema.yaml \\
            --source-path data/ \\
            --batch-size 5000 \\
            --n-cores 4 \\
            --fresh-start
    """
    cdir = dirname(realpath(__file__))

    # File-based logging config lives one package level up from this module.
    logging.config.fileConfig(
        join(cdir, "../logging.conf"), disable_existing_loggers=False
    )

    logging.basicConfig(level=logging.INFO)

    schema = Schema.from_dict(FileHandle.load(schema_path))

    # Load database connection config from file
    config_data = FileHandle.load(db_config_path)
    conn_conf = DBConfig.from_dict(config_data)

    if resource_pattern_config_path is not None:
        patterns = Patterns.from_dict(FileHandle.load(resource_pattern_config_path))
    else:
        patterns = Patterns()

    schema.fetch_resource()

    # Create ingestion params with CLI arguments (local import mirrors the
    # original module layout and avoids a potential import cycle at load time)
    from graflo.caster import IngestionParams

    # NOTE(review): n_threads is accepted for CLI compatibility; wire it into
    # IngestionParams once the parameter is confirmed to be supported there.
    ingestion_params = IngestionParams(
        n_cores=n_cores,
    )

    caster = Caster(
        schema,
        ingestion_params=ingestion_params,
    )

    # Validate that either source_path or data_source_config_path is provided
    if data_source_config_path is None and source_path is None:
        raise click.UsageError(
            "Either --source-path or --data-source-config-path must be provided"
        )

    # Check if data source config is provided (for API, SQL, etc.)
    if data_source_config_path is not None:
        # Load data source configuration
        data_source_config = FileHandle.load(data_source_config_path)
        registry = DataSourceRegistry()

        # Register data sources from config
        # Config format: {"data_sources": [{"source_type": "...", "resource_name": "...", ...}]}
        if "data_sources" in data_source_config:
            for ds_config in data_source_config["data_sources"]:
                # Copy so the loaded config dict is not mutated by the pops below
                ds_config_copy = ds_config.copy()
                resource_name = ds_config_copy.pop("resource_name")
                source_type = ds_config_copy.pop("source_type", None)

                # Create data source using factory
                data_source = DataSourceFactory.create_data_source(
                    source_type=source_type, **ds_config_copy
                )
                registry.register(data_source, resource_name=resource_name)

        # Update ingestion params with runtime options
        ingestion_params.clean_start = fresh_start
        ingestion_params.batch_size = batch_size
        ingestion_params.init_only = init_only

        caster.ingest_data_sources(
            data_source_registry=registry,
            conn_conf=conn_conf,
            ingestion_params=ingestion_params,
        )
    else:
        # Fall back to file-based ingestion
        # Update ingestion params with runtime options
        ingestion_params.clean_start = fresh_start
        ingestion_params.batch_size = batch_size
        ingestion_params.init_only = init_only
        ingestion_params.limit_files = limit_files

        caster.ingest(
            output_config=conn_conf,
            patterns=patterns,
            ingestion_params=ingestion_params,
        )


if __name__ == "__main__":
    ingest()
@@ -0,0 +1,197 @@
1
+ """Database management utilities for ArangoDB.
2
+
3
+ This module provides command-line tools for managing ArangoDB databases, including
4
+ backup and restore operations. It supports both local and Docker-based operations.
5
+
6
+ Key Features:
7
+ - Database backup and restore
8
+ - Docker and local execution modes
9
+ - Configurable connection settings
10
+ - Batch processing of multiple databases
11
+
12
+ Example:
13
+ $ uv run manage_dbs \\
14
+ --db-config-path config/db.yaml \\
15
+ --db mydb1 mydb2 \\
16
+ --store-directory-path /backups \\
17
+ --use-docker
18
+ """
19
+
20
+ import logging
21
+ import pathlib
22
+ import subprocess
23
+ import sys
24
+ from datetime import date
25
+
26
+ import click
27
+ from suthing import FileHandle, Timer
28
+
29
+ from graflo.db.connection.onto import ArangoConfig, DBConfig
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ def act_db(
35
+ conf: ArangoConfig,
36
+ db_name: str,
37
+ output_path: pathlib.Path,
38
+ restore: bool,
39
+ docker_version: str,
40
+ use_docker: bool,
41
+ ):
42
+ """Execute database backup or restore operation.
43
+
44
+ This function performs either a backup (arangodump) or restore (arangorestore)
45
+ operation on an ArangoDB database. It can use either the local arangodump/arangorestore
46
+ tools or run them in a Docker container.
47
+
48
+ Args:
49
+ conf: Database connection configuration
50
+ db_name: Name of the database to backup/restore
51
+ output_path: Path where backup will be stored or restored from
52
+ restore: Whether to restore (True) or backup (False)
53
+ docker_version: Version of ArangoDB Docker image to use
54
+ use_docker: Whether to use Docker for the operation
55
+
56
+ Returns:
57
+ None
58
+
59
+ Raises:
60
+ subprocess.CalledProcessError: If the backup/restore operation fails
61
+ """
62
+ host = f"tcp://{conf.hostname}:{conf.port}"
63
+ db_folder = output_path / db_name
64
+
65
+ cmd = "arangorestore" if restore else "arangodump"
66
+ if use_docker:
67
+ ru = (
68
+ f"docker run --rm --network=host -v {db_folder}:/dump"
69
+ f" arangodb/arangodb:{docker_version} {cmd}"
70
+ )
71
+ output = "--output-directory /dump"
72
+ else:
73
+ ru = f"{cmd}"
74
+ output = f"--output-directory {db_folder}"
75
+
76
+ dir_spec = "input" if restore else "output"
77
+
78
+ query = f"""{ru} --server.endpoint {host} --server.username {conf.username} --server.password "{conf.password}" --{dir_spec}-directory {output} --server.database "{db_name}" """
79
+
80
+ restore_suffix = "--create-database true --force-same-database true"
81
+ if restore:
82
+ query += restore_suffix
83
+ else:
84
+ query += "--overwrite true"
85
+
86
+ flag = subprocess.run(query, shell=True)
87
+ logger.info(f"returned {flag}")
88
+
89
+
90
@click.command()
@click.option(
    "--db-config-path",
    type=click.Path(exists=True, path_type=pathlib.Path),
    required=False,
    default=None,
)
@click.option("--db-host", type=str)
@click.option("--db-password", type=str)
@click.option("--db-user", type=str, default="root")
@click.option(
    "--db",
    type=str,
    multiple=True,
    required=True,
    # Fixed: this help text was copy-pasted from --store-directory-path
    help="name of a database to back up or restore (repeatable)",
)
@click.option(
    "--store-directory-path",
    type=click.Path(path_type=pathlib.Path),
    required=True,
    help="filesystem path where to dump db snapshot",
)
@click.option("--docker-version", type=str, default="3.12.1")
@click.option("--restore", type=bool, default=False, is_flag=True)
@click.option("--use-docker", type=bool, default=True)
def manage_dbs(
    db_config_path,
    db_host,
    db_password,
    db_user,
    db,
    store_directory_path,
    restore,
    docker_version,
    use_docker=True,
):
    """Manage ArangoDB database backups and restores.

    This command provides functionality to backup and restore ArangoDB databases.
    It supports both local execution and Docker-based operations. The command can
    process multiple databases in sequence and provides timing information for
    each operation.

    Args:
        db_config_path: Path to database configuration file (optional)
        db_host: Database host address (if not using config file)
        db_password: Database password (if not using config file)
        db_user: Database username (default: root)
        db: List of database names to process
        store_directory_path: Path where backups will be stored/restored
        restore: Whether to restore (True) or backup (False)
        docker_version: Version of ArangoDB Docker image (default: 3.12.1)
        use_docker: Whether to use Docker for operations (default: True)

    Example:
        $ uv run manage_dbs \\
            --db-config-path config/db.yaml \\
            --db mydb1 --db mydb2 \\
            --store-directory-path /backups \\
            --use-docker
    """
    if db_config_path is None:
        # Construct URI from host, accepting both bare hosts and full URIs
        uri = db_host if db_host and "://" in db_host else f"http://{db_host}"
        db_conf = ArangoConfig(uri=uri, username=db_user, password=db_password)
    else:
        conn_conf = FileHandle.load(fpath=db_config_path)
        db_conf_raw = DBConfig.from_dict(conn_conf)
        # Type checker can't infer the specific type, but we know it's ArangoConfig from the config
        if not isinstance(db_conf_raw, ArangoConfig):
            raise ValueError(f"Expected ArangoConfig, got {type(db_conf_raw)}")
        db_conf: ArangoConfig = db_conf_raw

    action = "restoring" if restore else "dumping"
    if restore:
        out_path = store_directory_path
    else:
        # Dumps go into a date-stamped subfolder of the store directory
        out_path = (
            store_directory_path.expanduser().resolve() / date.today().isoformat()
        )

    # parents=True so a missing store directory does not abort the run
    out_path.mkdir(parents=True, exist_ok=True)

    with Timer() as t_all:
        for dbname in db:
            with Timer() as t_dump:
                try:
                    act_db(
                        db_conf,
                        dbname,
                        out_path,
                        restore=restore,
                        docker_version=docker_version,
                        use_docker=use_docker,
                    )
                except Exception as e:
                    # Log with traceback and continue with the remaining databases
                    logger.exception(e)
            logger.info(
                f"{action} {dbname} took {t_dump.mins} mins {t_dump.secs:.2f} sec"
            )
    logger.info(f"all {action} took {t_all.mins} mins {t_all.secs:.2f} sec")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
    manage_dbs()
@@ -0,0 +1,132 @@
1
+ """Schema visualization tool for graph databases.
2
+
3
+ This module provides functionality for visualizing graph database schemas using Graphviz.
4
+ It includes tools for plotting vertex-to-vertex relationships, vertex fields, and resource
5
+ mappings. The module supports various visualization options and graph layout customization.
6
+
7
+ Key Components:
8
+ - SchemaPlotter: Main class for schema visualization
9
+ - knapsack: Utility for optimizing graph layout
10
+ - plot_schema: CLI command for schema visualization
11
+
12
+ Graphviz Attributes Reference:
13
+ - https://renenyffenegger.ch/notes/tools/Graphviz/attributes/index
14
+ - https://rsms.me/graphviz/
15
+ - https://graphviz.readthedocs.io/en/stable/examples.html
16
+ - https://graphviz.org/doc/info/attrs.html
17
+
18
+ Example:
19
+ >>> plot_schema(schema_path="schema.yaml", figure_output_path="schema.png")
20
+ """
21
+
22
+ import logging
23
+ import sys
24
+
25
+ import click
26
+
27
+ from graflo.plot.plotter import SchemaPlotter
28
+
29
+ """
30
+
31
+ graphviz attributes
32
+
33
+ https://renenyffenegger.ch/notes/tools/Graphviz/attributes/index
34
+ https://rsms.me/graphviz/
35
+ https://graphviz.readthedocs.io/en/stable/examples.html
36
+ https://graphviz.org/doc/info/attrs.html
37
+
38
+ usage:
39
+ color='red',style='filled', fillcolor='blue',shape='square'
40
+
41
+ to keep
42
+ level_one = [node1, node2]
43
+ sg_one = ag.add_subgraph(level_one, rank='same')
44
+
45
+ """
46
+
47
+
48
def knapsack(weights, ks_size=7):
    """Split items into groups whose total weight is at most ``ks_size``.

    Greedy first-fit-decreasing partition: repeatedly open a new group with the
    heaviest remaining item, then add the heaviest remaining items that still
    fit. Used for optimizing graph layout by balancing node distribution.

    The previous implementation popped ``pp[len(pp) - j - 1]`` while ``pp``
    shrank during iteration, so from the second pop of a pass onward it removed
    the wrong element and could raise IndexError (e.g. ``knapsack([5, 1, 1, 1],
    8)`` crashed); it also crashed on empty input and printed debug output.

    Args:
        weights: Sequence of item weights.
        ks_size: Maximum total weight per group (default: 7).

    Returns:
        list[list[int]]: List of groups, where each group is a list of indices
            into ``weights``.

    Raises:
        ValueError: If any single weight exceeds ks_size.

    Example:
        >>> knapsack([3, 4, 2, 5, 1], ks_size=7)
        [[3, 2], [1, 0], [4]]
    """
    if not weights:
        return []
    if max(weights) > ks_size:
        raise ValueError("One of the items is larger than the knapsack")

    # Indices sorted by weight ascending: the heaviest item sits at the end.
    order = sorted(range(len(weights)), key=lambda i: weights[i])
    groups: list[list[int]] = []

    while order:
        # Seed a new group with the heaviest remaining item.
        group = [order.pop()]
        total = weights[group[0]]

        # Scan the rest from heaviest to lightest, taking anything that fits.
        # Iterating by decreasing index keeps earlier positions stable when
        # popping, which is what the original index arithmetic got wrong.
        i = len(order) - 1
        while i >= 0 and total < ks_size:
            w = weights[order[i]]
            if total + w <= ks_size:
                group.append(order.pop(i))
                total += w
            i -= 1

        groups.append(group)

    return groups
96
+
97
+
98
@click.command()
@click.option("-c", "--schema-path", type=click.Path(), required=True)
@click.option("-o", "--figure-output-path", type=click.Path(), required=True)
@click.option("-p", "--prune-low-degree-nodes", type=bool, default=False)
def plot_schema(schema_path, figure_output_path, prune_low_degree_nodes):
    """Generate visualizations of the graph database schema.

    This command creates multiple visualizations of the schema:
    1. Vertex-to-vertex relationships
    2. Vertex fields and their relationships
    3. Resource mappings

    The visualizations are saved to the specified output path.

    Args:
        schema_path: Path to the schema configuration file
        figure_output_path: Path where the visualization will be saved
        prune_low_degree_nodes: Whether to remove nodes with low connectivity
            from the visualization (default: False)

    Example:
        $ uv run plot_schema -c schema.yaml -o schema.png
    """
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)

    # SchemaPlotter reads the schema file and writes figures to the output path.
    plotter = SchemaPlotter(schema_path, figure_output_path)
    # Vertex-to-vertex relationship graph; optionally drop low-degree leaves.
    plotter.plot_vc2vc(prune_leaves=prune_low_degree_nodes)
    # Vertex fields and their relationships.
    plotter.plot_vc2fields()
    # Resource mapping diagrams.
    plotter.plot_resources()
    # plotter.plot_source2vc()
    # plotter.plot_source2vc_detailed()


if __name__ == "__main__":
    plot_schema()
graflo/cli/xml2json.py ADDED
@@ -0,0 +1,93 @@
1
+ """XML to JSON conversion tool for data preprocessing.
2
+
3
+ This module provides a command-line tool for converting XML files to JSON format,
4
+ with support for different data sources and chunking options. It's particularly
5
+ useful for preprocessing scientific literature data from sources like Web of Science
6
+ and PubMed.
7
+
8
+ Key Features:
9
+ - Support for Web of Science and PubMed XML formats
10
+ - Configurable chunking for large files
11
+ - Batch processing of multiple files
12
+ - Customizable output format
13
+
14
+ Example:
15
+ $ uv run xml2json \\
16
+ --source-path data/wos.xml \\
17
+ --chunk-size 1000 \\
18
+ --mode wos_csv
19
+ """
20
+
21
+ import logging
22
+ import pathlib
23
+ import sys
24
+
25
+ import click
26
+
27
+ from graflo.util.chunker import convert, force_list_wos, tag_wos
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
@click.command()
@click.option(
    "-s",
    "--source-path",
    type=click.Path(path_type=pathlib.Path),
    required=True,
)
@click.option("-c", "--chunk-size", type=int, default=1000)
@click.option("-m", "--max-chunks", type=int, default=None)
@click.option("--mode", type=str)
def do(source_path, chunk_size, max_chunks, mode):
    """Convert XML files to JSON format.

    This command processes XML files and converts them to JSON format, with support
    for different data sources and chunking options.

    Args:
        source_path: Path to source XML file or directory
        chunk_size: Number of records per output file (default: 1000)
        max_chunks: Maximum number of chunks to process (default: None)
        mode: Data source mode ('wos_csv' or 'pubmed')

    Raises:
        ValueError: If ``mode`` is not 'wos_csv' or 'pubmed'.

    Example:
        $ uv run xml2json \\
            --source-path data/wos.xml \\
            --chunk-size 1000 \\
            --mode wos_csv
    """
    if mode == "wos_csv":
        # Regex used to strip the default XML namespace declaration from WoS records
        pattern = r"xmlns=\".*[^\"]\"(?=>)"
        force_list = force_list_wos
        tag = tag_wos
    elif mode == "pubmed":
        pattern = None
        force_list = None
        tag = "PubmedArticle"
    else:
        raise ValueError(f"Unknown mode {mode}")

    if source_path.is_dir():
        files = [
            fp for fp in source_path.iterdir() if not fp.is_dir() and "xml" in fp.name
        ]
    else:
        # Match the directory branch's filter. The previous check required
        # ".xml." in the name, which silently skipped plain "*.xml" files —
        # including the module's own documented example (data/wos.xml).
        files = [source_path] if "xml" in source_path.name else []

    if not files:
        logger.warning("no xml files found under %s", source_path)

    for fp in files:
        # Output files share the input's stem, written next to the input file
        target_root = str(fp.parent / fp.name.split(".")[0])

        convert(
            fp,
            target_root=target_root,
            chunk_size=chunk_size,
            max_chunks=max_chunks,
            pattern=pattern,
            force_list=force_list,
            root_tag=tag,
        )


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
    do()
@@ -0,0 +1,48 @@
1
+ """Data source abstraction layer for graph database ingestion.
2
+
3
+ This package provides a unified interface for different data source types,
4
+ separating "where data comes from" (DataSource) from "how it's transformed" (Resource).
5
+
6
+ Key Components:
7
+ - AbstractDataSource: Base class for all data sources
8
+ - FileDataSource: File-based data sources (JSON, JSONL, CSV/TSV)
9
+ - APIDataSource: REST API data source
10
+ - SQLDataSource: SQL database data source
11
+ - DataSourceRegistry: Maps DataSources to Resource names
12
+
13
+ Example:
14
+ >>> from graflo.data_source import FileDataSource, DataSourceRegistry
15
+ >>> source = FileDataSource(path="data.json", file_type="json")
16
+ >>> registry = DataSourceRegistry()
17
+ >>> registry.register(source, resource_name="users")
18
+ """
19
+
20
+ from .api import APIConfig, APIDataSource, PaginationConfig
21
+ from .base import AbstractDataSource, DataSourceType
22
+ from .factory import DataSourceFactory
23
+ from .file import (
24
+ FileDataSource,
25
+ JsonFileDataSource,
26
+ JsonlFileDataSource,
27
+ TableFileDataSource,
28
+ )
29
+ from .memory import InMemoryDataSource
30
+ from .registry import DataSourceRegistry
31
+ from .sql import SQLConfig, SQLDataSource
32
+
33
+ __all__ = [
34
+ "AbstractDataSource",
35
+ "APIConfig",
36
+ "APIDataSource",
37
+ "DataSourceFactory",
38
+ "DataSourceRegistry",
39
+ "DataSourceType",
40
+ "FileDataSource",
41
+ "InMemoryDataSource",
42
+ "JsonFileDataSource",
43
+ "JsonlFileDataSource",
44
+ "PaginationConfig",
45
+ "SQLConfig",
46
+ "SQLDataSource",
47
+ "TableFileDataSource",
48
+ ]