graflo-1.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in the public registries.

Files changed (45)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +39 -0
  3. graflo/architecture/__init__.py +37 -0
  4. graflo/architecture/actor.py +974 -0
  5. graflo/architecture/actor_util.py +425 -0
  6. graflo/architecture/edge.py +295 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +277 -0
  13. graflo/caster.py +409 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +144 -0
  16. graflo/cli/manage_dbs.py +193 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/db/__init__.py +32 -0
  20. graflo/db/arango/__init__.py +16 -0
  21. graflo/db/arango/conn.py +734 -0
  22. graflo/db/arango/query.py +180 -0
  23. graflo/db/arango/util.py +88 -0
  24. graflo/db/connection.py +304 -0
  25. graflo/db/manager.py +104 -0
  26. graflo/db/neo4j/__init__.py +16 -0
  27. graflo/db/neo4j/conn.py +432 -0
  28. graflo/db/util.py +49 -0
  29. graflo/filter/__init__.py +21 -0
  30. graflo/filter/onto.py +400 -0
  31. graflo/logging.conf +22 -0
  32. graflo/onto.py +186 -0
  33. graflo/plot/__init__.py +17 -0
  34. graflo/plot/plotter.py +556 -0
  35. graflo/util/__init__.py +23 -0
  36. graflo/util/chunker.py +739 -0
  37. graflo/util/merge.py +148 -0
  38. graflo/util/misc.py +37 -0
  39. graflo/util/onto.py +63 -0
  40. graflo/util/transform.py +406 -0
  41. graflo-1.1.0.dist-info/METADATA +157 -0
  42. graflo-1.1.0.dist-info/RECORD +45 -0
  43. graflo-1.1.0.dist-info/WHEEL +4 -0
  44. graflo-1.1.0.dist-info/entry_points.txt +5 -0
  45. graflo-1.1.0.dist-info/licenses/LICENSE +126 -0
graflo/cli/ingest.py ADDED
@@ -0,0 +1,144 @@
+ """Data ingestion command-line interface for graph databases.
+
+ This module provides a CLI tool for ingesting data into graph databases. It supports
+ batch processing, parallel execution, and various data formats. The tool can handle
+ both initial database setup and incremental data ingestion.
+
+ Key Features:
+     - Configurable batch processing
+     - Multi-core and multi-threaded execution
+     - Support for custom resource patterns
+     - Database initialization and cleanup options
+     - Flexible file discovery and processing
+
+ Example:
+     $ uv run ingest \\
+         --db-config-path config/db.yaml \\
+         --schema-path config/schema.yaml \\
+         --source-path data/ \\
+         --batch-size 5000 \\
+         --n-cores 4
+ """
+
+ import logging.config
+ import pathlib
+ from os.path import dirname, join, realpath
+
+ import click
+ from suthing import ConfigFactory, FileHandle
+
+ from graflo import Caster, Patterns, Schema
+
+ logger = logging.getLogger(__name__)
+
+
+ @click.command()
+ @click.option(
+     "--db-config-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option(
+     "--schema-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option(
+     "--source-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option(
+     "--resource-pattern-config-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     default=None,
+ )
+ @click.option("--limit-files", type=int, default=None)
+ @click.option("--batch-size", type=int, default=5000)
+ @click.option("--n-cores", type=int, default=1)
+ @click.option(
+     "--n-threads",
+     type=int,
+     default=1,
+ )
+ @click.option("--fresh-start", default=False, is_flag=True, help="wipe existing database")
+ @click.option(
+     "--init-only", default=False, is_flag=True, help="skip ingestion; only init the db"
+ )
+ def ingest(
+     db_config_path,
+     schema_path,
+     source_path,
+     limit_files,
+     batch_size,
+     n_cores,
+     n_threads,
+     fresh_start,
+     init_only,
+     resource_pattern_config_path,
+ ):
+     """Ingest data into a graph database.
+
+     This command processes data files and ingests them into a graph database according
+     to the provided schema. It supports various configuration options for controlling
+     the ingestion process.
+
+     Args:
+         db_config_path: Path to database configuration file
+         schema_path: Path to schema configuration file
+         source_path: Path to source data directory
+         limit_files: Optional limit on number of files to process
+         batch_size: Number of items to process in each batch (default: 5000)
+         n_cores: Number of CPU cores to use for parallel processing (default: 1)
+         n_threads: Number of threads per core for parallel processing (default: 1)
+         fresh_start: Whether to wipe existing database before ingestion
+         init_only: Whether to only initialize the database without ingestion
+         resource_pattern_config_path: Optional path to resource pattern configuration
+
+     Example:
+         $ uv run ingest \\
+             --db-config-path config/db.yaml \\
+             --schema-path config/schema.yaml \\
+             --source-path data/ \\
+             --batch-size 5000 \\
+             --n-cores 4 \\
+             --fresh-start
+     """
+     cdir = dirname(realpath(__file__))
+
+     logging.config.fileConfig(
+         join(cdir, "../logging.conf"), disable_existing_loggers=False
+     )
+
+     logging.basicConfig(level=logging.INFO)
+
+     schema = Schema.from_dict(FileHandle.load(schema_path))
+
+     conn_conf = ConfigFactory.create_config(db_config_path)
+
+     if resource_pattern_config_path is not None:
+         patterns = Patterns.from_dict(FileHandle.load(resource_pattern_config_path))
+     else:
+         patterns = Patterns()
+
+     schema.fetch_resource()
+
+     caster = Caster(
+         schema,
+         n_cores=n_cores,
+         n_threads=n_threads,
+     )
+
+     caster.ingest_files(
+         path=source_path,
+         limit_files=limit_files,
+         clean_start=fresh_start,
+         batch_size=batch_size,
+         conn_conf=conn_conf,
+         patterns=patterns,
+         init_only=init_only,
+     )
+
+
+ if __name__ == "__main__":
+     ingest()
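
The CLI above is a thin wrapper over the library API, so the same run can be driven from Python directly. A minimal sketch, reusing only the calls visible in this module; the config paths mirror the docstring example and all values are placeholders:

    import pathlib

    from suthing import ConfigFactory, FileHandle
    from graflo import Caster, Patterns, Schema

    # load the schema and connection config the same way the CLI does
    schema = Schema.from_dict(FileHandle.load(pathlib.Path("config/schema.yaml")))
    conn_conf = ConfigFactory.create_config(pathlib.Path("config/db.yaml"))
    schema.fetch_resource()

    caster = Caster(schema, n_cores=4, n_threads=1)
    caster.ingest_files(
        path=pathlib.Path("data/"),
        limit_files=None,
        clean_start=True,  # equivalent of --fresh-start
        batch_size=5000,
        conn_conf=conn_conf,
        patterns=Patterns(),  # no resource-pattern config supplied
        init_only=False,
    )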
graflo/cli/manage_dbs.py ADDED
@@ -0,0 +1,193 @@
+ """Database management utilities for ArangoDB.
+
+ This module provides command-line tools for managing ArangoDB databases, including
+ backup and restore operations. It supports both local and Docker-based operations.
+
+ Key Features:
+     - Database backup and restore
+     - Docker and local execution modes
+     - Configurable connection settings
+     - Batch processing of multiple databases
+
+ Example:
+     $ uv run manage_dbs \\
+         --db-config-path config/db.yaml \\
+         --db mydb1 --db mydb2 \\
+         --store-directory-path /backups \\
+         --use-docker true
+ """
+
+ import logging
+ import pathlib
+ import subprocess
+ import sys
+ from datetime import date
+
+ import click
+ from suthing import ArangoConnectionConfig, ConfigFactory, FileHandle, Timer
+
+ logger = logging.getLogger(__name__)
+
+
+ def act_db(
+     conf: ArangoConnectionConfig,
+     db_name: str,
+     output_path: pathlib.Path,
+     restore: bool,
+     docker_version: str,
+     use_docker: bool,
+ ):
+     """Execute database backup or restore operation.
+
+     This function performs either a backup (arangodump) or restore (arangorestore)
+     operation on an ArangoDB database. It can use either the local arangodump/arangorestore
+     tools or run them in a Docker container.
+
+     Args:
+         conf: Database connection configuration
+         db_name: Name of the database to backup/restore
+         output_path: Path where backup will be stored or restored from
+         restore: Whether to restore (True) or backup (False)
+         docker_version: Version of ArangoDB Docker image to use
+         use_docker: Whether to use Docker for the operation
+
+     Returns:
+         None
+
+     Raises:
+         subprocess.CalledProcessError: If the backup/restore operation fails
+     """
+     host = f"tcp://{conf.hostname}:{conf.port}"
+     db_folder = output_path / db_name
+
+     cmd = "arangorestore" if restore else "arangodump"
+     if use_docker:
+         ru = (
+             f"docker run --rm --network=host -v {db_folder}:/dump"
+             f" arangodb/arangodb:{docker_version} {cmd}"
+         )
+         directory = "/dump"
+     else:
+         ru = f"{cmd}"
+         directory = f"{db_folder}"
+
+     dir_spec = "input" if restore else "output"
+
+     query = f"""{ru} --server.endpoint {host} --server.username {conf.cred_name} --server.password "{conf.cred_pass}" --{dir_spec}-directory {directory} --server.database "{db_name}" """
+
+     restore_suffix = "--create-database true --force-same-database true"
+     if restore:
+         query += restore_suffix
+     else:
+         query += "--overwrite true"
+
+     flag = subprocess.run(query, shell=True, check=True)
+     logger.info(f"returned {flag}")
+
+
+ @click.command()
+ @click.option(
+     "--db-config-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=False,
+     default=None,
+ )
+ @click.option("--db-host", type=str)
+ @click.option("--db-password", type=str)
+ @click.option("--db-user", type=str, default="root")
+ @click.option(
+     "--db",
+     type=str,
+     multiple=True,
+     required=True,
+     help="name of a database to process; repeat the flag for multiple databases",
+ )
+ @click.option(
+     "--store-directory-path",
+     type=click.Path(path_type=pathlib.Path),
+     required=True,
+     help="filesystem path where to dump db snapshot",
+ )
+ @click.option("--docker-version", type=str, default="3.12.1")
+ @click.option("--restore", type=bool, default=False, is_flag=True)
+ @click.option("--use-docker", type=bool, default=True)
+ def manage_dbs(
+     db_config_path,
+     db_host,
+     db_password,
+     db_user,
+     db,
+     store_directory_path,
+     restore,
+     docker_version,
+     use_docker,
+ ):
+     """Manage ArangoDB database backups and restores.
+
+     This command provides functionality to backup and restore ArangoDB databases.
+     It supports both local execution and Docker-based operations. The command can
+     process multiple databases in sequence and provides timing information for
+     each operation.
+
+     Args:
+         db_config_path: Path to database configuration file (optional)
+         db_host: Database host address (if not using config file)
+         db_password: Database password (if not using config file)
+         db_user: Database username (default: root)
+         db: List of database names to process
+         store_directory_path: Path where backups will be stored/restored
+         restore: Whether to restore (True) or backup (False)
+         docker_version: Version of ArangoDB Docker image (default: 3.12.1)
+         use_docker: Whether to use Docker for operations (default: True)
+
+     Example:
+         $ uv run manage_dbs \\
+             --db-config-path config/db.yaml \\
+             --db mydb1 --db mydb2 \\
+             --store-directory-path /backups \\
+             --use-docker true
+     """
+     if db_config_path is None:
+         db_conf: ArangoConnectionConfig = ArangoConnectionConfig(
+             cred_name=db_user, cred_pass=db_password, hosts=db_host
+         )
+     else:
+         conn_conf = FileHandle.load(fpath=db_config_path)
+         db_conf: ArangoConnectionConfig = ConfigFactory.create_config(
+             dict_like=conn_conf
+         )
+
+     action = "restoring" if restore else "dumping"
+     if restore:
+         out_path = store_directory_path
+     else:
+         out_path = (
+             store_directory_path.expanduser().resolve() / date.today().isoformat()
+         )
+
+     if not out_path.exists():
+         out_path.mkdir(parents=True, exist_ok=True)
+
+     with Timer() as t_all:
+         for dbname in db:
+             with Timer() as t_dump:
+                 try:
+                     act_db(
+                         db_conf,
+                         dbname,
+                         out_path,
+                         restore=restore,
+                         docker_version=docker_version,
+                         use_docker=use_docker,
+                     )
+                 except Exception as e:
+                     logger.error(e)
+             logger.info(
+                 f"{action} {dbname} took {t_dump.mins} mins {t_dump.secs:.2f} sec"
+             )
+     logger.info(f"all {action} took {t_all.mins} mins {t_all.secs:.2f} sec")
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+     manage_dbs()
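
For orientation, the shell line that act_db assembles for a Docker-based dump of a database mydb on localhost:8529 comes out roughly as follows (shown wrapped for readability; the credentials and the dated backup directory are illustrative placeholders):

    docker run --rm --network=host -v /backups/2025-01-01/mydb:/dump \
        arangodb/arangodb:3.12.1 arangodump \
        --server.endpoint tcp://localhost:8529 \
        --server.username root --server.password "..." \
        --output-directory /dump --server.database "mydb" --overwrite true

A restore swaps arangodump for arangorestore, switches to --input-directory, and appends --create-database true --force-same-database true in place of --overwrite true.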
graflo/cli/plot_schema.py ADDED
@@ -0,0 +1,132 @@
+ """Schema visualization tool for graph databases.
+
+ This module provides functionality for visualizing graph database schemas using Graphviz.
+ It includes tools for plotting vertex-to-vertex relationships, vertex fields, and resource
+ mappings. The module supports various visualization options and graph layout customization.
+
+ Key Components:
+     - SchemaPlotter: Main class for schema visualization
+     - knapsack: Utility for optimizing graph layout
+     - plot_schema: CLI command for schema visualization
+
+ Graphviz Attributes Reference:
+     - https://renenyffenegger.ch/notes/tools/Graphviz/attributes/index
+     - https://rsms.me/graphviz/
+     - https://graphviz.readthedocs.io/en/stable/examples.html
+     - https://graphviz.org/doc/info/attrs.html
+
+ Example:
+     $ uv run plot_schema -c schema.yaml -o schema.png
+ """
+
+ import logging
+ import sys
+
+ import click
+
+ from graflo.plot.plotter import SchemaPlotter
+
+ # graphviz node attribute usage:
+ #     color='red', style='filled', fillcolor='blue', shape='square'
+ #
+ # to keep nodes on the same rank:
+ #     level_one = [node1, node2]
+ #     sg_one = ag.add_subgraph(level_one, rank='same')
+
+
+ def knapsack(weights, ks_size=7):
+     """Split a set of weights into groups of at most threshold weight.
+
+     This function implements a greedy algorithm to partition weights into groups
+     where each group's total weight is at most ks_size. It's used for optimizing
+     graph layout by balancing node distribution.
+
+     Args:
+         weights: List of weights to partition
+         ks_size: Maximum total weight per group (default: 7)
+
+     Returns:
+         list[list[int]]: List of groups, where each group is a list of indices
+             from the original weights list
+
+     Raises:
+         ValueError: If any single weight exceeds ks_size
+
+     Example:
+         >>> weights = [3, 4, 2, 5, 1]
+         >>> knapsack(weights, ks_size=7)
+         [[3], [1, 0], [2, 4]]  # group weights: 5, 7, 3
+     """
+     pp = sorted(zip(range(len(weights)), weights), key=lambda x: x[1])
+     acc = []
+     if pp[-1][1] > ks_size:
+         raise ValueError("One of the items is larger than the knapsack")
+
+     while pp:
+         # start a group with the heaviest remaining item
+         w_item = [pp.pop()]
+         ww_item = sum(w for _, w in w_item)
+         while ww_item < ks_size:
+             cnt = 0
+             # greedily add the heaviest remaining items that still fit
+             for item in pp[::-1]:
+                 if ks_size - item[1] - ww_item >= 0:
+                     cnt += 1
+                     pp.remove(item)
+                     w_item.append(item)
+                     ww_item += item[1]
+                 else:
+                     break
+             if ww_item >= ks_size or cnt == 0:
+                 break
+         acc.append(w_item)
+     acc_ret = [[y for y, _ in subitem] for subitem in acc]
+     return acc_ret
+
+
+ @click.command()
+ @click.option("-c", "--schema-path", type=click.Path(), required=True)
+ @click.option("-o", "--figure-output-path", type=click.Path(), required=True)
+ @click.option("-p", "--prune-low-degree-nodes", type=bool, default=False)
+ def plot_schema(schema_path, figure_output_path, prune_low_degree_nodes):
+     """Generate visualizations of the graph database schema.
+
+     This command creates multiple visualizations of the schema:
+     1. Vertex-to-vertex relationships
+     2. Vertex fields and their relationships
+     3. Resource mappings
+
+     The visualizations are saved to the specified output path.
+
+     Args:
+         schema_path: Path to the schema configuration file
+         figure_output_path: Path where the visualization will be saved
+         prune_low_degree_nodes: Whether to remove nodes with low connectivity
+             from the visualization (default: False)
+
+     Example:
+         $ uv run plot_schema -c schema.yaml -o schema.png
+     """
+     logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+
+     plotter = SchemaPlotter(schema_path, figure_output_path)
+     plotter.plot_vc2vc(prune_leaves=prune_low_degree_nodes)
+     plotter.plot_vc2fields()
+     plotter.plot_resources()
+     # plotter.plot_source2vc()
+     # plotter.plot_source2vc_detailed()
+
+
+ if __name__ == "__main__":
+     plot_schema()
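
A quick sanity check of the greedy grouping, run against the function as listed above (the second call illustrates that an item weighing exactly ks_size becomes its own group):

    >>> knapsack([3, 4, 2, 5, 1], ks_size=7)
    [[3], [1, 0], [2, 4]]  # group weights: 5, 7, 3
    >>> knapsack([7, 7, 1], ks_size=7)
    [[1], [0], [2]]  # group weights: 7, 7, 1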
graflo/cli/xml2json.py ADDED
@@ -0,0 +1,93 @@
+ """XML to JSON conversion tool for data preprocessing.
+
+ This module provides a command-line tool for converting XML files to JSON format,
+ with support for different data sources and chunking options. It's particularly
+ useful for preprocessing scientific literature data from sources like Web of Science
+ and PubMed.
+
+ Key Features:
+     - Support for Web of Science and PubMed XML formats
+     - Configurable chunking for large files
+     - Batch processing of multiple files
+     - Customizable output format
+
+ Example:
+     $ uv run xml2json \\
+         --source-path data/wos.xml \\
+         --chunk-size 1000 \\
+         --mode wos_csv
+ """
+
+ import logging
+ import pathlib
+ import sys
+
+ import click
+
+ from graflo.util.chunker import convert, force_list_wos, tag_wos
+
+ logger = logging.getLogger(__name__)
+
+
+ @click.command()
+ @click.option(
+     "-s",
+     "--source-path",
+     type=click.Path(path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option("-c", "--chunk-size", type=int, default=1000)
+ @click.option("-m", "--max-chunks", type=int, default=None)
+ @click.option("--mode", type=str, required=True)
+ def do(source_path, chunk_size, max_chunks, mode):
+     """Convert XML files to JSON format.
+
+     This command processes XML files and converts them to JSON format, with support
+     for different data sources and chunking options.
+
+     Args:
+         source_path: Path to source XML file or directory
+         chunk_size: Number of records per output file (default: 1000)
+         max_chunks: Maximum number of chunks to process (default: None)
+         mode: Data source mode ('wos_csv' or 'pubmed')
+
+     Example:
+         $ uv run xml2json \\
+             --source-path data/wos.xml \\
+             --chunk-size 1000 \\
+             --mode wos_csv
+     """
+     if mode == "wos_csv":
+         pattern = r"xmlns=\".*[^\"]\"(?=>)"
+         force_list = force_list_wos
+         tag = tag_wos
+     elif mode == "pubmed":
+         pattern = None
+         force_list = None
+         tag = "PubmedArticle"
+     else:
+         raise ValueError(f"Unknown mode {mode}")
+
+     if source_path.is_dir():
+         files = [
+             fp for fp in source_path.iterdir() if not fp.is_dir() and "xml" in fp.name
+         ]
+     else:
+         files = [source_path] if "xml" in source_path.name else []
+     for fp in files:
+         target_root = str(fp.parent / fp.name.split(".")[0])
+
+         convert(
+             fp,
+             target_root=target_root,
+             chunk_size=chunk_size,
+             max_chunks=max_chunks,
+             pattern=pattern,
+             force_list=force_list,
+             root_tag=tag,
+         )
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+     do()
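
The wos_csv pattern is a namespace stripper for the root element. Assuming convert applies it with re.sub before parsing (the actual plumbing lives in graflo.util.chunker), its effect looks like this on a hypothetical record:

    import re

    # same raw-string pattern as in the wos_csv branch above
    pattern = r"xmlns=\".*[^\"]\"(?=>)"
    line = '<records xmlns="http://example.com/schema">'
    print(re.sub(pattern, "", line))  # -> <records >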
graflo/db/__init__.py ADDED
@@ -0,0 +1,32 @@
+ """Database connection and management components.
+
+ This package provides database connection implementations and management utilities
+ for different graph databases (ArangoDB, Neo4j). It includes connection interfaces,
+ query execution, and database operations.
+
+ Key Components:
+     - Connection: Abstract database connection interface
+     - ConnectionManager: Database connection management
+     - ArangoDB: ArangoDB-specific implementation
+     - Neo4j: Neo4j-specific implementation
+     - Query: Query generation and execution utilities
+
+ Example:
+     >>> from graflo.db import ConnectionManager
+     >>> from graflo.db.arango import ArangoConnection
+     >>> manager = ConnectionManager(
+     ...     connection_config={"url": "http://localhost:8529"},
+     ...     conn_class=ArangoConnection
+     ... )
+     >>> with manager as conn:
+     ...     conn.init_db(schema)
+ """
+
+ from .connection import Connection, ConnectionType
+ from .manager import ConnectionManager
+
+ __all__ = [
+     "Connection",
+     "ConnectionManager",
+     "ConnectionType",
+ ]
graflo/db/arango/__init__.py ADDED
@@ -0,0 +1,16 @@
+ """ArangoDB database implementation.
+
+ This package provides ArangoDB-specific implementations of the database interface,
+ including connection management, query execution, and utility functions.
+
+ Key Components:
+     - ArangoConnection: ArangoDB connection implementation
+     - Query: AQL query execution and profiling
+     - Util: ArangoDB-specific utility functions
+
+ Example:
+     >>> from graflo.db.arango import ArangoConnection
+     >>> conn = ArangoConnection(config)
+     >>> cursor = conn.execute("FOR doc IN users RETURN doc")
+     >>> results = cursor.batch()
+ """