makegis 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
makegis/__init__.py ADDED
@@ -0,0 +1 @@
1
# Package version; logged by the CLI at startup.
__version__ = "0.1.0"
makegis/cli.py ADDED
@@ -0,0 +1,235 @@
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+ import sys
5
+
6
+ import dotenv
7
+ from rich.console import Console
8
+ from rich.logging import RichHandler
9
+
10
+ from . import __version__
11
+ from .config import RootConfig
12
+ from .dag.builder import Builder
13
+ from .targets import Target
14
+ from . import errors
15
+
16
+ console = Console()
17
+
18
+ log = logging.getLogger("makegis")
19
+
20
+
21
def cli():
    """Entry point of the ``mkgs`` command line interface.

    Pre-scans ``sys.argv`` for the global verbosity flags, configures
    logging, builds the argparse tree, and dispatches to the handler
    registered for the chosen subcommand.
    """
    # Handle general -v and -d options outside of argparse so they are
    # accepted anywhere on the command line, before or after a subcommand.
    args = sys.argv[1:]
    verbose_flags = ["-v", "--verbose"]
    debug_flags = ["--debug"]
    # any() takes a generator directly; no need to build throwaway lists.
    debug = any(flag in args for flag in debug_flags)
    verbose = not debug and any(flag in args for flag in verbose_flags)
    args = [a for a in args if a not in verbose_flags + debug_flags]

    # Configure logger; --debug wins over --verbose.
    # `log_format` instead of `format` to avoid shadowing the builtin.
    level = logging.WARN
    log_format = "%(message)s"
    datefmt = "[%X]"
    show_time = False
    show_path = False
    if debug:
        level = logging.DEBUG
        show_path = True
    elif verbose:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format=log_format,
        datefmt=datefmt,
        handlers=[
            RichHandler(
                console=console,
                rich_tracebacks=True,
                show_path=show_path,
                show_time=show_time,
            )
        ],
    )
    log.info(f"makegis {__version__}")

    parser = argparse.ArgumentParser(prog="mkgs")

    # The --verbose and --debug options are parsed outside of argparse but we still declare
    # them here so they show up as general options in the generated help.
    parser.add_argument("-v", "--verbose", action="store_true", help="verbose messages")
    parser.add_argument("--debug", action="store_true", help="debug messages")

    subparsers = parser.add_subparsers(dest="command", help="commands")

    def add_target_argument(parser):
        # Shared -t/--target option for subcommands that touch a database.
        parser.add_argument(
            "-t",
            "--target",
            action="store",
            type=str,
            default=None,
            help="db instance to target",
        )

    init_parser = subparsers.add_parser("init", help="create schemas and journal table")
    add_target_argument(init_parser)
    init_parser.set_defaults(func=init)

    list_parser = subparsers.add_parser("ls", help="list nodes")
    list_parser.add_argument("pattern", type=str, help="DAG selection pattern")
    list_parser.set_defaults(func=show)

    outdated_parser = subparsers.add_parser("outdated", help="report outdated nodes")
    add_target_argument(outdated_parser)
    outdated_parser.set_defaults(func=outdated)

    run_parser = subparsers.add_parser("run", help="run nodes")
    run_parser.add_argument("pattern", type=str, help="DAG selection pattern")
    add_target_argument(run_parser)
    run_parser.add_argument(
        "-d",
        "--dry-run",
        action="store_true",
        help="process nodes without actually running them",
    )
    run_parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="also run fresh nodes",
    )
    run_parser.set_defaults(func=run)

    # Load .env from the working directory before parsing so environment
    # variables are available early.
    dotenv.load_dotenv(".env")

    # Parse preprocessed args
    args = parser.parse_args(args)

    # Inject the verbose/debug flags stripped before argparse ran.
    args.verbose = verbose
    args.debug = debug

    # Call handler registered via set_defaults, or print usage when no
    # subcommand was given.
    if hasattr(args, "func"):
        args.func(args)
    else:
        parser.print_help()
120
+
121
+
122
def init(args):
    """Handler for ``mkgs init``: create schemas and the journal table."""
    cfg = load_root_config()
    target_id = args.target or cfg.defaults.target
    if target_id is None:
        # A target must come from -t/--target or defaults.target in the root
        # config. A bare assert would be stripped under `python -O`, so fail
        # explicitly (same pattern as find_root_config).
        log.error("No target given and no default target configured.")
        sys.exit(1)
    log.info(f"using target {target_id}")
    target = Target(cfg.targets[target_id])

    dag = Builder(cfg).build()
    target.ensure_schemas(dag.list_schemas())
    target.init_journal()
132
+
133
+
134
def outdated(args):
    """Handler for ``mkgs outdated``: print every node that is out of date."""
    cfg = load_root_config()
    target_id = args.target or cfg.defaults.target
    if target_id is None:
        # Replaces a bare assert: asserts vanish under `python -O`.
        log.error("No target given and no default target configured.")
        sys.exit(1)
    log.info(f"using target {target_id}")
    target = Target(cfg.targets[target_id])

    dag = Builder(cfg).build()
    node_ids = dag.get_outdated(target)
    if not node_ids:
        print("All nodes are up to date.")

    for node_id in node_ids:
        print(dag.render_node(node_id))
148
+
149
+
150
def run(args):
    """Handler for ``mkgs run``: run the selected nodes against a target.

    Without ``--force`` only outdated nodes are run; with ``--dry-run`` the
    nodes are enumerated but the target is never modified.
    """
    cfg = load_root_config()

    target_id = args.target or cfg.defaults.target
    if target_id is None:
        # Replaces a bare assert: asserts vanish under `python -O`.
        log.error("No target given and no default target configured.")
        sys.exit(1)
    log.info(f"using target {target_id}")
    target = Target(cfg.targets[target_id])

    # (removed dead local `dry_run = args.dry_run == True`; args.dry_run is
    # used directly below)
    if args.dry_run:
        log.info("dry run - target will not be modified")

    dag = Builder(cfg).build()
    node_ids = dag.select_nodes(args.pattern)
    if not node_ids:
        print("No nodes matching selection pattern.")
        return

    if not args.force:
        # Narrow the selection to outdated nodes only. Use a name that does
        # not shadow the module-level `outdated` handler.
        outdated_ids = dag.get_outdated(target, limit_to=node_ids)
        node_ids = outdated_ids

        if not node_ids:
            print("All selected nodes are up to date. Use --force to run anyways.")
            return

    n = len(node_ids)
    with console.status("") as status:
        for inode, node_id in enumerate(node_ids):
            status.update(f"Running node {inode + 1}/{n}: {node_id}")
            if args.dry_run:
                log.info(f"dry running node '{node_id}'")
                continue
            log.info(f"running node '{node_id}'")
            try:
                dag.run_node(node_id, target)
            except errors.FailedNodeRun as e:
                # Known failure mode: the message is already user-facing.
                log.error(e.message)
                return
            except Exception:
                # Unexpected failure: include the traceback.
                log.exception(f"node '{node_id}' run failed!")
                return
    if args.dry_run:
        print(f"Dry run done. Would've run {n} node(s).")
    else:
        print(f"Done. Ran {n} node(s).")
196
+
197
+
198
def show(args):
    """Handler for ``mkgs ls``: print every node matching the pattern."""
    root_cfg = load_root_config()
    graph = Builder(root_cfg).build()

    matches = graph.select_nodes(args.pattern)
    if not matches:
        print("No matching nodes.")
        return

    for node_id in matches:
        print(graph.render_node(node_id))
208
+
209
+
210
def load_root_config():
    """Locate, load and return the project's root configuration.

    Also loads the ``.env`` file that sits next to ``makegis.root.yml`` so
    its variables are available when the config is parsed.
    """
    root_path = find_root_config()

    # Load .env in same dir as makegis.root.yml
    dotenv.load_dotenv(root_path.parent / ".env")

    return RootConfig.from_file(root_path)
218
+
219
+
220
+ def find_root_config(cwd: Path = Path(".").resolve()):
221
+ """
222
+ Returns path to first makegis.root.yml file found in current dir or parents.
223
+ """
224
+ path = cwd / "makegis.root.yml"
225
+ if path.exists():
226
+ return path
227
+ parent = cwd.parent
228
+ if parent == cwd:
229
+ log.error("Found no makegis root file in current directory or its parents.")
230
+ exit(1)
231
+ return find_root_config(cwd=parent)
232
+
233
+
234
# Allow running the CLI module directly (e.g. `python -m makegis.cli`).
if __name__ == "__main__":
    cli()
@@ -0,0 +1,2 @@
1
+ from .root import RootConfig
2
+ from .root import TargetConfig
@@ -0,0 +1,253 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Dict
4
+ from typing import List
5
+ from typing import Optional
6
+ from typing import Literal
7
+
8
+ from pydantic import BaseModel
9
+ from pydantic import model_validator
10
+ from pydantic import ValidationError
11
+ import yaml
12
+
13
+ try:
14
+ from yaml import CLoader as Loader
15
+ except ImportError:
16
+ from yaml import Loader
17
+
18
+ from .utils import expand_dict_strings
19
+
20
+ log = logging.getLogger("makegis")
21
+
22
+
23
class LoadDefaults(BaseModel):
    """Default load options; ``None`` means "no default set here".

    Field meanings mirror the per-source options of the load blocks.
    """

    # CRS identifier of the data (numeric EPSG code or string form).
    epsg: int | str | None = None
    # Whether to index geometries.
    geom_index: bool | None = None
    # Name of the geometry column.
    geom_column: str | None = None
    # Whether to index raster bounds.
    raster_index: bool | None = None
    # Name of the raster column.
    raster_column: str | None = None
    # Whether to set the standard set of raster constraints.
    raster_constraints: bool | None = None
    # Raster tile size.
    tile_size: int | None = None
31
+
32
+
33
class BaseSourceBlock(BaseModel):
    """Options shared by every source block of a load section."""

    # Explicit CRS of the source dataset (EPSG code or string form).
    epsg: int | str | None = None
    # Name of column to use as primary key
    pk: str | None = None
37
+
38
+
39
class VectorSourceBlock(BaseModel):
    """Options shared by vector (geometry-carrying) source blocks."""

    # Whether to index geometries.
    geom_index: bool | None = None
    # Name of the geometry column.
    geom_column: str | None = None
42
+
43
+
44
class CSVSourceBlock(BaseSourceBlock, VectorSourceBlock):
    """A ``csv:`` source entry in a load block."""

    type: Literal["csv"] = "csv"
    # Path to the CSV file.
    path: Path
    # TODO:
    # x_column: str | None = None
    # y_column: str | None = None
    # keep_xy_columns: bool = False
51
+
52
+
53
class EsriSourceBlock(BaseSourceBlock, VectorSourceBlock):
    """An ``esri:`` source entry (Esri service URL)."""

    type: Literal["esri"] = "esri"
    # Service URL.
    url: str
    # Response format to request from the service.
    f: Literal["pjson", "pgeojson"] = "pjson"
57
+
58
+
59
class DuckDBSourceBlock(BaseSourceBlock, VectorSourceBlock):
    """A ``duckdb:`` source entry."""

    type: Literal["duckdb"] = "duckdb"
    # Path to the DuckDB database file.
    path: Path
    # Name of the table to import (see core.load.DuckDBSource).
    table: Optional[str] = None
63
+
64
+
65
class FileSourceBlock(BaseSourceBlock, VectorSourceBlock):
    """A ``file:`` source entry (local vector file)."""

    type: Literal["file"] = "file"
    # Path to the file.
    path: Path
    # Optional layer name for file formats supporting it.
    layer: str | None = None
69
+
70
+
71
class RasterSourceBlock(BaseSourceBlock):
    """A ``raster:`` source entry."""

    type: Literal["raster"] = "raster"
    # Path to the raster file.
    path: Path
    # Whether to index raster bounds.
    raster_index: bool | None = None
    # Name of the raster column.
    raster_column: str | None = None
    # Whether to set the standard set of raster constraints.
    raster_constraints: bool | None = None
    # Raster tile size.
    tile_size: int | None = None
78
+
79
+
80
class WFSSourceBlock(BaseSourceBlock, VectorSourceBlock):
    """A ``wfs:`` source entry."""

    # WFS endpoint URL.
    type: Literal["wfs"] = "wfs"
    url: str
83
+
84
+
85
+ type SourceBlock = CSVSourceBlock | EsriSourceBlock | DuckDBSourceBlock | FileSourceBlock | RasterSourceBlock | WFSSourceBlock
86
+
87
+ SOURCE_KEYS = set(["csv", "esri", "duckdb", "file", "raster", "wfs"])
88
+
89
+
90
class LoadItem(BaseModel):
    """One named dataset of a load block (one ``name: {options}`` entry)."""

    # Item name (the mapping key); presumably the destination table name —
    # TODO confirm against the DAG builder.
    name: str
    # Parsed source block.
    src: SourceBlock
    # Free-form metadata attached to the item.
    meta: Dict[str, str | int | float | None]

    @classmethod
    def from_kv(cls, k: str, v: Dict):
        """Build a LoadItem from a raw ``name -> options`` mapping entry.

        Exactly one of the keys in SOURCE_KEYS must be present; its value
        becomes the block's path/url and the remaining entries are passed
        through to the block model. NOTE: mutates ``v`` (pops consumed keys).

        Raises:
            RuntimeError: if zero or more than one source key is present.
        """
        name = k
        meta = v.pop("meta", {})
        matched_source_keys = [sk for sk in SOURCE_KEYS if sk in v]
        if len(matched_source_keys) == 0:
            raise RuntimeError(
                # typo fixed: "execting" -> "expecting"
                f"Missing source key in load block item, expecting one of {SOURCE_KEYS}"
            )
        elif len(matched_source_keys) > 1:
            raise RuntimeError(
                f"Too many source keys in load block item, expecting exactly one of {SOURCE_KEYS}"
            )
        # Pop the single matched key once; its value is the path/url.
        source_key = matched_source_keys[0]
        value = v.pop(source_key)
        if source_key == "csv":
            src = CSVSourceBlock(path=value, **v)
        elif source_key == "esri":
            src = EsriSourceBlock(url=value, **v)
        elif source_key == "duckdb":
            src = DuckDBSourceBlock(path=value, **v)
        elif source_key == "file":
            src = FileSourceBlock(path=value, **v)
        elif source_key == "raster":
            src = RasterSourceBlock(path=value, **v)
        elif source_key == "wfs":
            src = WFSSourceBlock(url=value, **v)
        else:
            raise NotImplementedError("Unhandled source key in load block item")

        return LoadItem(name=name, src=src, meta=meta)
130
+
131
+
132
class SQLTransform(BaseModel):
    """A single SQL transform step, referenced by file path."""

    # Path to the SQL file.
    path: Path
134
+
135
+
136
class LoadBlock(BaseModel):
    """Parsed top-level ``load`` block: shared defaults plus one item per dataset."""

    defaults: LoadDefaults
    items: List[LoadItem]

    @classmethod
    def from_dict(cls, d: Dict):
        """Build a LoadBlock from the raw mapping (mutates ``d``)."""
        default_opts = LoadDefaults(**d.pop("defaults", {}))
        parsed = [LoadItem.from_kv(key, value) for key, value in d.items()]
        return LoadBlock(defaults=default_opts, items=parsed)
145
+
146
+
147
class TransformBlock(BaseModel):
    """Parsed top-level ``transform`` block: an ordered list of SQL steps."""

    transforms: List[SQLTransform]

    @classmethod
    def from_sequence(cls, s: List):
        """Build a TransformBlock from a YAML sequence of file paths."""
        steps = []
        for entry in s:
            steps.append(SQLTransform(path=entry))
        return TransformBlock(transforms=steps)
154
+
155
+
156
class DatabaseItem(BaseModel):
    """A database object a node creates or depends on."""

    # Kind of database object.
    type: Literal["table", "function"]
    # Object name.
    name: str

    @classmethod
    def from_dict(cls, d: Dict):
        """Build from a one-key mapping like ``{"table": "some.name"}``.

        Raises:
            RuntimeError: if the mapping does not have exactly one key.
                (An assert would be silently stripped under ``python -O``.)
        """
        if len(d) != 1:
            raise RuntimeError(
                "each item in a 'creates' or 'deps' block must have exactly 1 key e.g. - table: name"
            )
        k, v = next(iter(d.items()))
        return DatabaseItem(type=k, name=v)
167
+
168
+
169
class RunTask(BaseModel):
    """A command a node runs, plus the database objects it creates."""

    # Command line to execute.
    cmd: str
    # Database objects the command creates.
    creates: List[DatabaseItem]

    @classmethod
    def from_dict(cls, d: Dict):
        """Build a RunTask from the raw mapping (mutates ``d``)."""
        created = [DatabaseItem.from_dict(entry) for entry in d.pop("creates", [])]
        return RunTask(creates=created, **d)
177
+
178
+
179
class DoBlock(BaseModel):
    """The 'do' key in a 'node' block"""

    load: Optional[LoadBlock] = None
    run: Optional[List[RunTask]] = None

    @model_validator(mode="after")
    def at_least_one(self):
        """Require at least one of ``load``/``run``.

        Raise ValueError here: pydantic wraps it into a proper
        ValidationError. Raising ValidationError("msg") directly fails —
        it cannot be constructed from a bare message.
        """
        if self.load is None and self.run is None:
            raise ValueError(
                "A node's do block must have a 'load' and/or a 'run' key"
            )
        return self

    @classmethod
    def from_dict(cls, d: Dict):
        """Build a DoBlock from the raw 'do' mapping."""
        load, tasks = None, None
        if "load" in d:
            load = LoadBlock.from_dict(d["load"])
        if "run" in d:
            tasks = [RunTask.from_dict(t) for t in d["run"]]
        return DoBlock(load=load, run=tasks)
201
+
202
+
203
class NodeBlock(BaseModel):
    """Top-level 'node' block in a makegis.yml file"""

    deps: List[DatabaseItem] | None = []
    prep: List[str] | None = []
    do: DoBlock
    post: List[str] | None = []
    cleanup: List[str] | None = []

    @classmethod
    def from_dict(cls, d: Dict):
        """Build a NodeBlock from the raw 'node' mapping (mutates ``d``).

        The comprehension variable is named ``item`` so it no longer
        shadows the mapping ``d`` it iterates.
        """
        deps = [DatabaseItem.from_dict(item) for item in d.pop("deps", [])]
        do = DoBlock.from_dict(d.pop("do"))
        return NodeBlock(deps=deps, do=do, **d)
217
+
218
+
219
class MakeGISConfig(BaseModel):
    """A parsed makegis.yml file: exactly one load/transform/node block."""

    # The parsed block.
    block: LoadBlock | TransformBlock | NodeBlock
    # Which kind of block it is.
    type: Literal["load", "transform", "node"]

    @classmethod
    def from_file(cls, path: Path):
        """Read and parse a makegis.yml file."""
        log.debug(f"reading {path}")
        with open(path) as f:
            d = yaml.load(f, Loader)
        return cls.from_dict(d)

    @classmethod
    def from_yaml(cls, s: str):
        """Parse a makegis.yml document from a YAML string."""
        d = yaml.load(s, Loader)
        return cls.from_dict(d)

    @classmethod
    def from_dict(cls, d: Dict):
        """Build from a raw mapping after expanding ``{{VAR}}`` strings.

        Raises:
            RuntimeError: if the mapping does not have exactly one known
                top-level key. (Replaces a bare assert that ``python -O``
                would strip.)
        """
        expand_dict_strings(d)
        if len(d) != 1:
            raise RuntimeError(
                "A makegis file must contain exactly one top-level key"
            )
        key = next(iter(d))
        if key == "load":
            typ = "load"
            block = LoadBlock.from_dict(d["load"])
        elif key == "transform":
            typ = "transform"
            block = TransformBlock.from_sequence(d["transform"])
        elif key == "node":
            typ = "node"
            block = NodeBlock.from_dict(d["node"])
        else:
            raise RuntimeError(
                f"Unknown makegis file key '{key}', should be one of load, transform or node"
            )
        return MakeGISConfig(type=typ, block=block)
makegis/config/root.py ADDED
@@ -0,0 +1,69 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Dict
4
+
5
+ from pydantic import BaseModel
6
+ import yaml
7
+
8
+ try:
9
+ from yaml import CLoader as Loader
10
+ except ImportError:
11
+ from yaml import Loader
12
+
13
+ from .makegis import LoadDefaults
14
+ from .utils import expand_dict_strings
15
+
16
+ log = logging.getLogger("makegis")
17
+
18
+
19
class RootDefaults(BaseModel):
    """Defaults declared in makegis.root.yml."""

    # Default load options — presumably merged with each load block's own
    # defaults; TODO confirm against the DAG builder.
    load: LoadDefaults = LoadDefaults()
    # Id of the target used when -t/--target is not given on the CLI.
    target: str | None = None
22
+
23
+
24
class TargetConfig(BaseModel):
    """Describes a target database"""

    # optional description
    description: str | None = None
    host: str = "localhost"
    port: int = 5432
    user: str = "postgres"
    # database name
    db: str

    def conn_uri(self) -> str:
        """Return a postgresql:// connection URI for this target."""
        return f"postgresql://{self.user}@{self.host}:{self.port}/{self.db}"

    def conn_str(self) -> str:
        """Return a keyword/value connection string for this target."""
        return (
            f"host={self.host} port={self.port} "
            f"dbname={self.db} user={self.user}"
        )
42
+
43
+
44
class RootConfig(BaseModel):
    """Parsed makegis.root.yml."""

    # Directory holding the project's makegis source files.
    src_dir: Path
    defaults: RootDefaults
    # Known database targets, keyed by id.
    targets: Dict[str, TargetConfig]

    @classmethod
    def from_file(cls, path: Path):
        """Read the root config file; src_dir is resolved relative to it."""
        log.debug(f"reading {path}")
        with open(path) as f:
            raw = yaml.load(f, Loader)
        cfg = cls.from_dict(raw)
        # Resolve path of src dir
        if not cfg.src_dir.is_absolute():
            cfg.src_dir = (path.parent / cfg.src_dir).resolve()
        return cfg

    @classmethod
    def from_yaml(cls, s: str):
        """Parse a root config document from a YAML string."""
        return cls.from_dict(yaml.load(s, Loader))

    @classmethod
    def from_dict(cls, d: Dict):
        """Build from a raw mapping after expanding ``{{VAR}}`` strings (mutates ``d``)."""
        expand_dict_strings(d)
        defaults = RootDefaults(**d.pop("defaults", {}))
        return RootConfig(defaults=defaults, **d)
@@ -0,0 +1,24 @@
1
+ import os
2
+ from typing import Dict
3
+ import re
4
+
5
+
6
def expand_dict_strings(raw_dict: Dict):
    """
    Replaces {{variables}} found in strings in place.

    Walks nested dicts and substitutes every ``{{VAR}}`` occurrence in a
    string value with ``os.environ["VAR"]``.

    Raises:
        RuntimeError: if a referenced variable is not set in the environment.
    """
    pattern = re.compile(r"\{\{\s*(\w+)\s*\}\}")

    def replacement(match: re.Match) -> str:
        var = match.group(1)
        if var not in os.environ:
            raise RuntimeError(f"unmatched env var {var}")
        return os.environ[var]

    def expand_string_values(d: dict):
        for k, v in d.items():
            if isinstance(v, dict):
                expand_string_values(v)
            if isinstance(v, str):
                # Use a callable replacement so env values containing
                # backslashes or \1-style sequences are inserted literally;
                # re.sub interprets escapes in a plain replacement string.
                d[k] = pattern.sub(replacement, v)

    expand_string_values(raw_dict)
File without changes
@@ -0,0 +1,7 @@
1
+ from pathlib import Path
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
class Command(BaseModel):
    """A command referenced by the path of its file."""

    # Path to the command file.
    path: Path
makegis/core/load.py ADDED
@@ -0,0 +1,77 @@
1
+ from pathlib import Path
2
+ from dataclasses import dataclass
3
+ from typing import Literal
4
+
5
+
6
@dataclass(frozen=True)
class BaseSource:
    """Fields common to every load source."""

    # Explicit srid of geometry in source dataset
    epsg: int | None
    # Name of column to use as primary key
    pk: str | None
12
+
13
+
14
@dataclass(frozen=True)
class CSVSource(BaseSource):
    """A CSV file source."""

    # Path to the CSV file.
    path: Path
17
+
18
+
19
@dataclass(frozen=True)
class EsriSource(BaseSource):
    """An Esri service source."""

    # Service URL.
    url: str
    # Response format to request from the service.
    f: Literal["pgeojson", "pjson"]
23
+
24
+
25
@dataclass(frozen=True)
class DuckDBSource(BaseSource):
    """A DuckDB database source."""

    # Path to database file
    path: Path
    # Fully qualified name of table to import
    table: str
31
+
32
+
33
@dataclass(frozen=True)
class FileSource(BaseSource):
    """A local file source."""

    # Path to the file.
    path: Path
    # Optional layer name for file formats supporting it
    layer: str | None
38
+
39
+
40
@dataclass(frozen=True)
class RasterSource(BaseSource):
    """A raster file source."""

    # Path to the raster file.
    path: Path
43
+
44
+
45
@dataclass(frozen=True)
class WFSSource(BaseSource):
    """A WFS endpoint source."""

    # WFS endpoint URL.
    url: str
48
+
49
+
50
+ type Source = EsriSource | DuckDBSource | FileSource | WFSSource
51
+
52
+
53
@dataclass(frozen=True)
class Destination:
    """Where and how a load job writes into the target database."""

    # Destination schema name.
    schema: str
    # Destination table name.
    table: str
    # Desired srid of geometry column in destination table
    epsg: int | None
    # Name to assign to geometry column. Keep original if None.
    geom_column: str | None
    # Whether to index geometries or not.
    # Defaults to False to be conservative.
    geom_index: bool
    # Name to assign to raster column.
    raster_column: str
    # Whether to index raster bounds
    raster_index: bool
    # Whether to set the standard set of raster constraints
    raster_constraints: bool
    # Raster tile size
    tile_size: int | None
72
+
73
+
74
@dataclass(frozen=True)
class LoadJob:
    """A single load operation: one source written to one destination."""

    # What to load.
    src: Source
    # Where to load it.
    dst: Destination
@@ -0,0 +1,7 @@
1
+ from pathlib import Path
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
class Transform(BaseModel):
    """A SQL transform referenced by the path of its .sql file."""

    # Path to the SQL file.
    sql: Path
@@ -0,0 +1 @@
1
+ from .dag import DAG