jupyter-duckdb 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckdb_kernel/__init__.py CHANGED
@@ -0,0 +1 @@
1
+ from .kernel import DuckDBKernel
duckdb_kernel/kernel.json CHANGED
File without changes
duckdb_kernel/kernel.py CHANGED
@@ -1,18 +1,23 @@
1
1
  import json
2
+ import math
2
3
  import os
3
- import re
4
4
  import time
5
5
  import traceback
6
6
  from typing import Optional, Dict, List, Tuple
7
7
 
8
8
  import duckdb
9
9
  from ipykernel.kernelbase import Kernel
10
- import checkmarkandcross
10
+
11
+ from .magics import *
12
+ from .util.formatting import row_count, rows_table, wrap_image
13
+ from .visualization import *
11
14
 
12
15
 
13
16
  class DuckDBKernel(Kernel):
17
+ DEFAULT_MAX_ROWS = 20
18
+
14
19
  implementation = 'DuckDB'
15
- implementation_version = '0.6.1'
20
+ implementation_version = '0.8.1'
16
21
  banner = 'DuckDB Kernel'
17
22
  language_info = {
18
23
  'name': 'duckdb',
@@ -23,6 +28,20 @@ class DuckDBKernel(Kernel):
23
28
  def __init__(self, **kwargs):
24
29
  super().__init__(**kwargs)
25
30
 
31
+ # register magic commands
32
+ self._magics: MagicCommandHandler = MagicCommandHandler()
33
+
34
+ self._magics.add(
35
+ MagicCommand('create').arg('database').opt('of').opt('with_tests').on(self._create_magic),
36
+ MagicCommand('load').arg('database').opt('with_tests').on(self._load_magic),
37
+ MagicCommand('test').arg('name').result(True).on(self._test_magic),
38
+ MagicCommand('all', 'all_rows').on(self._all_magic),
39
+ MagicCommand('max_rows').arg('count').on(self._max_rows_magic),
40
+ MagicCommand('query_max_rows').arg('count').on(self._query_max_rows_magic),
41
+ MagicCommand('schema').opt('lr').on(self._schema_magic)
42
+ )
43
+
44
+ # create placeholders for database and tests
26
45
  self._db: Optional[duckdb.DuckDBPyConnection] = None
27
46
  self._tests: Optional[Dict] = None
28
47
 
@@ -36,7 +55,9 @@ class DuckDBKernel(Kernel):
36
55
  def print_exception(self, e: Exception):
37
56
  if isinstance(e, AssertionError):
38
57
  text = str(e)
39
- elif isinstance(e, (duckdb.OperationalError, duckdb.ProgrammingError)):
58
+ elif isinstance(e, MagicCommandException):
59
+ text = str(e)
60
+ elif isinstance(e, (duckdb.OperationalError, duckdb.ProgrammingError, duckdb.InvalidInputException)):
40
61
  text = str(e)
41
62
  else:
42
63
  text = traceback.format_exc()
@@ -70,7 +91,8 @@ class DuckDBKernel(Kernel):
70
91
  else:
71
92
  return False
72
93
 
73
- def _execute_stmt(self, query: str, silent: bool) -> Tuple[List[str], List[List]]:
94
+ def _execute_stmt(self, query: str, silent: bool,
95
+ max_rows: Optional[int]) -> Tuple[Optional[List[str]], Optional[List[List]]]:
74
96
  if self._db is None:
75
97
  raise AssertionError('load a database first')
76
98
 
@@ -81,22 +103,44 @@ class DuckDBKernel(Kernel):
81
103
  et = time.time()
82
104
 
83
105
  if not silent:
106
+ # print EXPLAIN queries as raw text
84
107
  if query.strip().startswith('EXPLAIN'):
85
108
  rows = cursor.fetchall()
86
109
  for ekey, evalue in rows:
87
110
  self.print_data(f'<b>{ekey}</b><br><pre>{evalue}</pre>')
88
111
 
112
+ return None, None
113
+
114
+ # print every other query as a table
89
115
  else:
90
116
  # table header
91
- table_header = ''.join(map(lambda e: f'<th>{e[0]}</th>', cursor.description))
117
+ if cursor.description is None:
118
+ columns = []
119
+ else:
120
+ columns = [e[0] for e in cursor.description]
121
+
122
+ table_header = ''.join(f'<th>{c}</th>' for c in columns)
92
123
 
93
124
  # table data
94
125
  rows = cursor.fetchall()
95
126
 
96
- table_data = ''.join(map(
97
- lambda row: '<tr>' + ''.join(map(lambda e: f'<td>{e}</td>', row)) + '</tr>',
98
- rows
99
- ))
127
+ if max_rows is not None and len(rows) > max_rows:
128
+ table_data = f'''
129
+ {rows_table(rows[:math.ceil(max_rows / 2)])}
130
+ <tr>
131
+ <td colspan="{len(columns)}"
132
+ style="text-align: center"
133
+ title="{row_count(len(rows) - max_rows)} omitted">
134
+ ...
135
+ </td>
136
+ </tr>
137
+ {rows_table(rows[-math.floor(max_rows // 2):])}
138
+ '''
139
+ else:
140
+ table_data = ''.join(map(
141
+ lambda row: '<tr>' + ''.join(map(lambda e: f'<td>{e}</td>', row)) + '</tr>',
142
+ rows
143
+ ))
100
144
 
101
145
  # send to client
102
146
  self.print_data(f'''
@@ -106,12 +150,18 @@ class DuckDBKernel(Kernel):
106
150
  </table>
107
151
  ''')
108
152
 
109
- self.print_data(f'{len(rows)} row{"" if len(rows) == 1 else "s"} in {et - st:.3f}s')
153
+ self.print_data(f'{row_count(len(rows))} in {et - st:.3f}s')
110
154
 
111
- return [e[0] for e in cursor.description], rows
155
+ return columns, rows
112
156
 
113
157
  # magic command related functions
114
- def _load_magic(self, silent: bool, target: str, create: bool, source: Optional[str], tests: Optional[str]):
158
def _create_magic(self, silent: bool, path: str, of: Optional[str], with_tests: Optional[str]):
    """Handle the ``create`` magic by delegating to ``_load`` in create mode.

    :param silent: forwarded unchanged to ``_load``
    :param path: path of the database file
    :param of: optional source the new database is filled from
    :param with_tests: optional path of a tests file
    """
    # create=True is the only difference to the ``load`` magic
    self._load(silent, path, create=True, of=of, with_tests=with_tests)
160
+
161
def _load_magic(self, silent: bool, path: str, with_tests: Optional[str]):
    """Handle the ``load`` magic by delegating to ``_load`` without create mode.

    :param silent: forwarded unchanged to ``_load``
    :param path: path of the database file
    :param with_tests: optional path of a tests file
    """
    # loading never copies data from another source, so ``of`` is always None
    self._load(silent, path, create=False, of=None, with_tests=with_tests)
163
+
164
+ def _load(self, silent: bool, path: str, create: bool, of: Optional[str], with_tests: Optional[str]):
115
165
  # unload current database if necessary
116
166
  if self._unload_database():
117
167
  if not silent:
@@ -121,24 +171,31 @@ class DuckDBKernel(Kernel):
121
171
  if not silent:
122
172
  self.print(f'{self.implementation} {self.implementation_version}\n')
123
173
 
124
- # load new database
125
- if target.startswith(("'", '"')):
126
- target = target[1:-1]
174
+ # clean path
175
+ if path.startswith(("'", '"')):
176
+ path = path[1:]
177
+ if path.endswith(("'", '"')):
178
+ path = path[:-1]
127
179
 
128
- if create and os.path.exists(target):
129
- os.remove(target)
180
+ # load new database
181
+ if create and os.path.exists(path):
182
+ os.remove(path)
130
183
 
131
- if self._load_database(target, read_only=False):
184
+ if self._load_database(path, read_only=False):
132
185
  if not silent:
133
- self.print(f'loaded database {target}\n')
186
+ self.print(f'loaded database {path}\n')
134
187
 
135
188
  # copy data from source database
136
- if source is not None:
137
- if source.startswith(("'", '"')):
138
- source = source[1:-1]
139
-
140
- if source.endswith('.sql'):
141
- with open(source, 'r') as file:
189
+ if of is not None:
190
+ # clean path
191
+ if of.startswith(("'", '"')):
192
+ of = of[1:]
193
+ if of.endswith(("'", '"')):
194
+ of = of[:-1]
195
+
196
+ # load sql files
197
+ if of.endswith('.sql'):
198
+ with open(of, 'r') as file:
142
199
  content = file.read()
143
200
 
144
201
  # statements = re.split(r';\r?\n', content)
@@ -148,27 +205,28 @@ class DuckDBKernel(Kernel):
148
205
  self._db.execute(content)
149
206
 
150
207
  if not silent:
151
- self.print(f'executed {source}')
208
+ self.print(f'executed {of}\n')
152
209
 
210
+ # load database files
153
211
  else:
154
- with duckdb.connect(source, read_only=True) as source_db:
155
- source_db.execute('SHOW TABLES')
156
- for table, in source_db.fetchall():
157
- transfer_df = source_db.query(f'SELECT * FROM {table}').to_df()
212
+ with duckdb.connect(of, read_only=True) as of_db:
213
+ of_db.execute('SHOW TABLES')
214
+ for table, in of_db.fetchall():
215
+ transfer_df = of_db.query(f'SELECT * FROM {table}').to_df()
158
216
  self._db.execute(f'CREATE TABLE {table} AS SELECT * FROM transfer_df')
159
217
 
160
218
  if not silent:
161
219
  self.print(f'transferred table {table}\n')
162
220
 
163
221
  # load tests
164
- if tests is None:
222
+ if with_tests is None:
165
223
  self._tests = {}
166
224
  else:
167
- with open(tests, 'r') as tests_file:
225
+ with open(with_tests, 'r') as tests_file:
168
226
  self._tests = json.load(tests_file)
169
- self.print(f'loaded tests from {tests}')
227
+ self.print(f'loaded tests from {with_tests}\n')
170
228
 
171
- def _test_magic(self, name: str, description: List[str], result: List[List], silent: bool):
229
+ def _test_magic(self, silent: bool, _: List[str], result: List[List], name: str):
172
230
  # Testing makes no sense if there is no output.
173
231
  if silent:
174
232
  return
@@ -176,124 +234,127 @@ class DuckDBKernel(Kernel):
176
234
  # extract data for test
177
235
  data = self._tests[name]
178
236
 
179
- # prepare comparison functions
180
- def my_equals(row1, row2):
181
- return len(row1) == len(row2) and all((x == y for x, y in zip(row1, row2)))
182
-
183
- def my_in(row, rows):
184
- for r in rows:
185
- if my_equals(r, row):
186
- return True
187
-
188
- return False
189
-
190
237
  # ordered test
191
238
  if data['ordered']:
239
+ def my_equals(row1, row2):
240
+ return len(row1) == len(row2) and all((x == y for x, y in zip(row1, row2)))
241
+
192
242
  rows = data['equals']
193
243
  missing = len(rows) - len(result)
194
244
 
195
245
  if missing > 0:
196
- return self.print_data(checkmarkandcross.image_html(
197
- False, title=f'{missing} row{"" if missing == 1 else "s"} missing'
198
- ))
246
+ return self.print_data(wrap_image(False, f'{row_count(missing)} missing'))
199
247
 
200
248
  if missing < 0:
201
- return self.print_data(checkmarkandcross.image_html(
202
- False, title=f'{-missing} row{"" if -missing == 1 else "s"} more than required'
203
- ))
249
+ return self.print_data(wrap_image(False, f'{row_count(-missing)} more than required'))
204
250
 
205
251
  for data_row, result_row in zip(data['equals'], result):
206
252
  if not my_equals(data_row, result_row):
207
- return self.print_data(checkmarkandcross.image_html(False, title='found row without match'))
253
+ return self.print_data(wrap_image(False, 'found row without match'))
208
254
 
209
- return self.print_data(checkmarkandcross.image_html(True, title='success'))
255
+ return self.print_data(wrap_image(True))
210
256
 
211
257
  # unordered test
212
258
  else:
213
- rows = data['equals']
259
+ # prepare data structures
260
+ test_tuples = [tuple(row) for row in data['equals']]
261
+ test_counts: Dict[Tuple, int] = {}
214
262
 
215
- missing = 0
216
- for element in rows:
217
- if not my_in(element, result):
218
- missing += 1
263
+ for row in test_tuples:
264
+ if row not in test_counts:
265
+ test_counts[row] = 1
266
+ else:
267
+ test_counts[row] += 1
219
268
 
220
- if missing > 0:
221
- return self.print_data(checkmarkandcross.image_html(
222
- False, title=f'{missing} row{"" if missing == 1 else "s"} missing'
223
- ))
224
-
225
- over = 0
226
- for element in result:
227
- if not my_in(element, rows):
228
- over += 1
229
-
230
- if over > 0:
231
- return self.print_data(checkmarkandcross.image_html(
232
- False, title=f'{over} row{"" if over == 1 else "s"} more than required'
233
- ))
234
-
235
- return self.print_data(checkmarkandcross.image_html(True, title='success'))
236
-
237
- def _handle_magic(self, code: str, silent: bool):
238
- code_lower = code.lower()
239
-
240
- if code_lower.startswith('%load'):
241
- # parse line
242
- match = re.match(
243
- r'''^%LOAD +([^ ]+?|'.+?'|".+?")( +WITH +([^ ]+?|'.+?'|".+?"))?$''',
244
- code.strip(), re.IGNORECASE
245
- )
246
- if match is None:
247
- raise AssertionError('usage: %LOAD target.db [WITH tests.json]')
248
-
249
- # call
250
- self._load_magic(silent, match.group(1), False, None, match.group(3))
251
-
252
- elif code_lower.startswith('%create'):
253
- # parse line
254
- match = re.match(
255
- r'''^%CREATE +([^ ]+?|'.+?'|".+?")( +FROM +([^ ]+?|'.+?'|".+?"))?( +WITH +([^ ]+?|'.+?'|".+?"))?$''',
256
- code.strip(), re.IGNORECASE
257
- )
258
- if match is None:
259
- raise AssertionError('usage: %CREATE target.db [FROM (source.db | source.sql)] [WITH tests.json]')
260
-
261
- # call
262
- self._load_magic(silent, match.group(1), True, match.group(3), match.group(5))
263
-
264
- elif code_lower.startswith('%test'):
265
- # parse line
266
- match = re.match(
267
- r'''^%TEST +([^ ]+?|'.+?'|".+?")$''',
268
- code, re.IGNORECASE | re.MULTILINE
269
- )
270
-
271
- if match is None:
272
- raise AssertionError('usage: %TEST name')
273
- if match.group(1) not in self._tests:
274
- raise AssertionError(f'test {match.group(1)} unknown')
275
-
276
- # execute statement
277
- description, rows = self._execute_stmt(code[match.end():], silent)
278
-
279
- # execute tests
280
- self._test_magic(match.group(1), description, rows, silent)
269
+ result_tuples = [tuple(row) for row in result]
270
+ result_counts: Dict[Tuple, int] = {}
281
271
 
272
+ for row in result_tuples:
273
+ if row not in result_counts:
274
+ result_counts[row] = 1
275
+ else:
276
+ result_counts[row] += 1
277
+
278
+ # calculate diffs
279
+ diff: Dict[Tuple, int] = {}
280
+
281
+ for row, count in test_counts.items():
282
+ diff[row] = result_counts.get(row, 0) - count
283
+
284
+ for row, count in result_counts.items():
285
+ if row not in diff:
286
+ diff[row] = count - test_counts.get(row, 0)
287
+
288
+ below = sum(max(0, -count) for count in diff.values())
289
+ above = sum(max(0, count) for count in diff.values())
290
+
291
+ # print result
292
+ if below > 0 and above > 0:
293
+ self.print_data(wrap_image(False, f'{row_count(below)} missing, {row_count(above)} unnecessary'))
294
+ elif below > 0:
295
+ self.print_data(wrap_image(False, f'{row_count(below)} missing'))
296
+ elif above > 0:
297
+ self.print_data(wrap_image(False, f'{row_count(above)} unnecessary'))
298
+ else:
299
+ self.print_data(wrap_image(True))
300
+
301
def _all_magic(self, silent: bool):
    """Handle the ``all`` / ``all_rows`` magic.

    Returns execution arguments that disable the row limit
    (``max_rows`` is ``None``) for the current query.
    """
    return dict(max_rows=None)
305
+
306
def _max_rows_magic(self, silent: bool, count: str):
    """Handle the ``max_rows`` magic by changing the class-wide default limit.

    :param silent: unused
    :param count: the new limit as text; ``none`` (any case) removes the limit
    :raises ValueError: if ``count`` is neither ``none`` nor an integer
    """
    limit = None if count.lower() == 'none' else int(count)

    # stored on the class, not on the instance
    DuckDBKernel.DEFAULT_MAX_ROWS = limit
311
+
312
def _query_max_rows_magic(self, silent: bool, count: str):
    """Handle the ``query_max_rows`` magic.

    :param silent: unused
    :param count: the limit as text; ``none`` (any case) means no limit
    :return: execution arguments carrying the limit under ``max_rows``
    :raises ValueError: if ``count`` is neither ``none`` nor an integer
    """
    if count.lower() != 'none':
        limit = int(count)
    else:
        limit = None

    return {'max_rows': limit}
316
+
317
def _schema_magic(self, silent: bool, lr: Optional[str]):
    """Handle the ``schema`` magic: draw the database schema and send it as SVG.

    :param silent: if true, nothing is drawn or printed
    :param lr: optional text value of the ``lr`` option. ``None`` (option not
        given), ``false`` (any case) and numeric zero select the default
        layout; any other value requests a left-to-right layout.
    :raises AssertionError: if no database has been loaded yet
    """
    if silent:
        return

    # same guard and message as statement execution uses
    if self._db is None:
        raise AssertionError('load a database first')

    # The option is registered without a default value, so ``lr`` is None
    # whenever the magic is used without it. Calling str methods on it
    # unconditionally raised AttributeError.
    if lr is None:
        left_to_right = False
    elif lr.lower() == 'false':
        left_to_right = False
    elif lr.isnumeric():
        left_to_right = bool(int(lr))
    else:
        left_to_right = bool(lr)

    vd = VizDrawer(self._db)
    svg = vd.to_svg(left_to_right)

    self.print_data(svg)
284
332
 
285
333
  # jupyter related functions
286
334
  def do_execute(self, code: str, silent: bool,
287
335
  store_history: bool = True, user_expressions: dict = None, allow_stdin: bool = False,
288
336
  **kwargs):
289
337
  try:
290
- # handle magic commands
291
- if code.startswith('%'):
292
- self._handle_magic(code, silent)
338
+ # get magic command
339
+ clean_code, pre_query_callbacks, post_query_callbacks = self._magics(silent, code)
340
+
341
+ # execute magic commands here if it does not depend on query results
342
+ execution_args = {
343
+ 'max_rows': DuckDBKernel.DEFAULT_MAX_ROWS
344
+ }
293
345
 
294
- # execute statement otherwise
346
+ for callback in pre_query_callbacks:
347
+ execution_args.update(callback())
348
+
349
+ # execute statement if needed
350
+ if clean_code.strip():
351
+ cols, rows = self._execute_stmt(clean_code, silent, **execution_args)
295
352
  else:
296
- self._execute_stmt(code, silent)
353
+ cols, rows = None, None
354
+
355
+ # execute magic command here if it does depend on query results
356
+ for callback in post_query_callbacks:
357
+ callback(cols, rows)
297
358
 
298
359
  return {
299
360
  'status': 'ok',
@@ -0,0 +1,63 @@
1
+ from typing import Any, List, Tuple, Callable, Dict
2
+
3
+
4
class MagicCommand:
    """Declarative description of a single magic command.

    A command has one or more names, a list of positional arguments, a list
    of optional keyword-style parameters and any number of handler functions.
    The configuration methods (``arg``, ``opt``, ``result``, ``on``) return
    the command itself so that calls can be chained.
    """

    # One parameter value: a token without spaces or a quoted string.
    _ARG = '''([^ ]+?|'.+?'|".+?")'''

    def __init__(self, *names: str):
        """Create a command that is reachable under each of ``names``."""
        self._names: Tuple[str, ...] = names

        # (name, description) per positional argument
        self._arguments: List[Tuple[str, str]] = []
        # (name, default value, description) per optional parameter
        self._optionals: List[Tuple[str, Any, str]] = []
        self._on: List[Callable] = []
        self._result: bool = False

    @property
    def names(self) -> Tuple[str, ...]:
        """All names of this command."""
        return self._names

    @property
    def args(self) -> List[Tuple[str, str]]:
        """Positional arguments as ``(name, description)`` tuples."""
        return self._arguments

    @property
    def kwargs(self) -> List[Tuple[str, Any, str]]:
        """Optional parameters as ``(name, default value, description)`` tuples."""
        return self._optionals

    @property
    def requires_query_result(self) -> bool:
        """Whether the flag set through ``result`` is enabled."""
        return self._result

    def arg(self, name: str, description: str = None) -> 'MagicCommand':
        """Append a positional argument and return ``self``."""
        self._arguments.append((name, description))
        return self

    def opt(self, name: str, default_value: Any = None, description: str = None) -> 'MagicCommand':
        """Append an optional parameter and return ``self``."""
        self._optionals.append((name, default_value, description))
        return self

    def result(self, result: bool) -> 'MagicCommand':
        """Set the ``requires_query_result`` flag and return ``self``."""
        self._result = result
        return self

    def on(self, fun: Callable) -> 'MagicCommand':
        """Register a handler function and return ``self``."""
        self._on.append(fun)
        return self

    @property
    def parameters(self) -> str:
        """Regular expression matching the parameter part of this command.

        Each positional argument contributes one group. Each optional
        parameter contributes three groups: the whole ``name value`` span,
        the name and the value. Optional parameters are only matched in the
        order in which they were declared.
        """
        import re

        args = ' +'.join([self._ARG] * len(self._arguments))
        # Names are matched literally; escaping keeps regex metacharacters
        # in a name from changing the pattern.
        opts = ''.join(f'( +({re.escape(name)}) +{self._ARG})?' for name, *_ in self._optionals)

        return f'^ *{args}{opts} *$'

    def __call__(self, silent: bool, *args, **kwargs) -> Dict[str, Any]:
        """Call every handler and merge the dictionaries they return.

        Handlers are called in registration order with ``silent`` followed by
        ``args`` and ``kwargs``. Handlers returning ``None`` contribute
        nothing; for duplicate keys the last handler wins.
        """
        result = {}

        for fun in self._on:
            r = fun(silent, *args, **kwargs)
            if r is not None:
                result.update(r)

        return result
@@ -0,0 +1,21 @@
1
+ from typing import Optional, List
2
+
3
+ from . import MagicCommand
4
+
5
+
6
class MagicCommandCallback:
    """A magic command bound to the parameters it will later be called with."""

    def __init__(self, mc: 'MagicCommand', silent: bool, *args, **kwargs):
        """Remember the command together with its call parameters."""
        self._mc = mc
        self._silent = silent
        self._args = args
        self._kwargs = kwargs

    @property
    def requires_query_result(self) -> bool:
        """Mirror of the wrapped command's ``requires_query_result`` flag."""
        return self._mc.requires_query_result

    def __call__(self, columns: Optional[List[str]] = None, rows: Optional[List[List]] = None):
        """Invoke the wrapped command.

        ``columns`` and ``rows`` are only passed on (directly after
        ``silent``) if the command requires a query result.
        """
        if not self.requires_query_result:
            return self._mc(self._silent, *self._args, **self._kwargs)

        return self._mc(self._silent, columns, rows, *self._args, **self._kwargs)
@@ -0,0 +1,2 @@
1
class MagicCommandException(Exception):
    """Error raised for magic commands that are unknown or cannot be parsed."""
@@ -0,0 +1,71 @@
1
+ import re
2
+ from typing import Dict, Tuple, List
3
+
4
+ from . import MagicCommand, MagicCommandException, MagicCommandCallback
5
+
6
+
7
class MagicCommandHandler:
    """Registry of magic commands that also parses them out of cell code."""

    def __init__(self):
        # lower-cased command name -> command
        self._magics: Dict[str, 'MagicCommand'] = {}

    def add(self, *command: 'MagicCommand'):
        """Register every given command under each of its lower-cased names."""
        for cmd in command:
            self._magics.update({key.lower(): cmd for key in cmd.names})

    def __call__(self, silent: bool, code: str) -> Tuple[str, List['MagicCommandCallback'], List['MagicCommandCallback']]:
        """Strip leading magic command lines from ``code`` and prepare callbacks.

        Returns the remaining code, the callbacks that do not need a query
        result and the callbacks that do, in this order.

        Raises:
            MagicCommandException: for an unknown command or for parameters
                that do not match the command's parameter pattern.
        """
        before_query = []
        after_query = []

        while True:
            # ensure code starts with '%' or '%%' but not with '%%%'
            head = re.match(r'^%{1,2}([^% ]+?)($| .+?$)', code, re.MULTILINE | re.IGNORECASE)
            if head is None:
                # return callbacks
                return code, before_query, after_query

            # remove magic command (and the character following it) from code
            code = code[:head.start()] + code[head.end() + 1:]

            # extract command
            command = head.group(1).lower()
            magic = self._magics.get(command)

            if command not in self._magics:
                raise MagicCommandException(f'unknown magic command "{command}"')

            # extract parameters
            parsed = re.match(magic.parameters, head.group(2), re.IGNORECASE)
            if parsed is None:
                raise MagicCommandException(f'could not parse parameters for command "{command}"')

            groups = parsed.groups()

            # positional arguments occupy the leading groups
            args = list(groups[:len(magic.args)])

            # optional parameters start with their defaults; every optional
            # occupies three groups of which the last two are name and value
            kwargs = {name: default for name, default, _ in magic.kwargs}

            for pos in range(len(args) + 1, len(groups), 3):
                name, value = groups[pos], groups[pos + 1]
                if name is not None:
                    kwargs[name.lower()] = value

            # add to callbacks
            callback = MagicCommandCallback(magic, silent, *args, **kwargs)

            if magic.requires_query_result:
                after_query.append(callback)
            else:
                before_query.append(callback)
@@ -0,0 +1,4 @@
1
+ from .MagicCommand import MagicCommand
2
+ from .MagicCommandCallback import MagicCommandCallback
3
+ from .MagicCommandException import MagicCommandException
4
+ from .MagicCommandHandler import MagicCommandHandler
File without changes
@@ -0,0 +1,26 @@
1
+ from typing import List
2
+
3
+ import checkmarkandcross
4
+
5
+
6
def row_count(count: int) -> str:
    """Return ``count`` followed by ``row`` (for exactly 1) or ``rows``."""
    suffix = '' if count == 1 else 's'
    return f'{count} row{suffix}'
8
+
9
+
10
def rows_table(rows: List[List]) -> str:
    """Render rows as concatenated HTML ``<tr>`` elements.

    Every value is inserted into a ``<td>`` using its default string
    formatting; values are not HTML-escaped.
    """
    html_rows = []

    for row in rows:
        cells = ''.join(f'<td>{value}</td>' for value in row)
        html_rows.append(f'<tr>{cells}</tr>')

    return ''.join(html_rows)
15
+
16
+
17
def wrap_image(val: bool, msg: str = '') -> str:
    """Return an HTML snippet showing a 24px status icon next to ``msg``.

    The icon comes from ``checkmarkandcross.image_html`` and is selected by
    ``val``; ``msg`` is also used as the icon's title.
    """
    icon = checkmarkandcross.image_html(val, size=24, title=msg)

    return f'''
        <div style="display: flex; align-items: center; margin-top: 0.5rem">
            {icon}
            <span style="margin-left: 0.5rem">
                {msg}
            </span>
        </div>
    '''
@@ -0,0 +1,18 @@
1
+ import re
2
+
3
+ from .Table import Table
4
+
5
+
6
class Column:
    """A column of a table together with its data type."""

    def __init__(self, table: 'Table', name: str, data_type: str):
        """Store the owning table, the column name and the data type."""
        self.table: 'Table' = table
        self.name: str = name
        self.data_type: str = data_type

    def __hash__(self):
        # hash by name only; equality is not overridden
        return hash(self.name)

    @property
    def id(self) -> str:
        """Identifier built from the table's id and the sanitized column name.

        Every character that is not an ASCII letter or digit is replaced by
        an underscore. Digits are kept so that names which differ only in a
        number (``col1`` / ``col2``) do not end up with the same identifier.
        """
        name = re.sub(r'[^A-Za-z0-9]', '_', self.name)
        return f'{self.table.id}_column_{name}'
@@ -0,0 +1,11 @@
1
+ from typing import Tuple
2
+
3
+ from . import Column
4
+ from . import Table
5
+
6
+
7
class Constraint:
    """A key constraint: its index, its table and the columns it covers."""

    def __init__(self, index: int, table: 'Table', columns: Tuple['Column', ...]):
        """Store the three values unchanged as attributes of the same name."""
        # columns the constraint is defined on
        self.columns: Tuple['Column', ...] = columns
        # table the constraint belongs to
        self.table: 'Table' = table
        # index the constraint was created with
        self.index: int = index
@@ -0,0 +1,15 @@
1
+ from typing import Tuple, Iterator
2
+
3
+ from . import Column
4
+ from . import Constraint
5
+
6
+
7
class ForeignKey:
    """Columns of one table that refer to the columns of a constraint."""

    def __init__(self, columns: Tuple['Column', ...], constraint: 'Constraint'):
        """Store the referencing columns and the referenced constraint."""
        self.columns: Tuple['Column', ...] = columns
        self.constraint: 'Constraint' = constraint

    @property
    def references(self) -> Iterator[Tuple['Column', 'Column']]:
        """Iterate over ``(source, target)`` pairs.

        Own columns are paired by position with the constraint's columns;
        iteration stops with the shorter of the two.
        """
        yield from zip(self.columns, self.constraint.columns)
@@ -0,0 +1,27 @@
1
+ import re
2
+ from typing import List, Optional
3
+
4
+ from . import Column
5
+ from . import ForeignKey
6
+ from .Constraint import Constraint
7
+
8
+
9
class Table:
    """A database table with its columns and key constraints."""

    def __init__(self, name: str):
        """Create a table without columns and without keys."""
        self.name: str = name
        self.columns: List[Column] = []
        self.primary_key: Optional[Constraint] = None
        self.unique_keys: List[Constraint] = []
        self.foreign_keys: List[ForeignKey] = []

    @property
    def id(self) -> str:
        """Identifier built from the sanitized table name.

        Every character that is not an ASCII letter or digit is replaced by
        an underscore. Digits are kept so that names which differ only in a
        number (``t1`` / ``t2``) do not end up with the same identifier.
        """
        name = re.sub(r'[^A-Za-z0-9]', '_', self.name)
        return f'table_{name}'

    def get_column(self, name: str) -> "Column":
        """Return the first column called ``name``.

        Raises:
            AssertionError: if the table has no such column.
        """
        for column in self.columns:
            if column.name == name:
                return column

        raise AssertionError(f'could not find column {name} in table {self.name}')
@@ -0,0 +1,219 @@
1
+ from typing import Dict, List
2
+
3
+ from duckdb import DuckDBPyConnection
4
+ from graphviz import Digraph
5
+
6
+ from . import Constraint, Column, ForeignKey, Table
7
+
8
+
9
class VizDrawer:
    """Collect the schema of a DuckDB database and draw it with Graphviz.

    The constructor reads all base tables, their columns and their primary,
    unique and foreign keys from the given connection. ``to_graph`` and
    ``to_svg`` turn the collected tables into a diagram.
    """

    def __init__(self, con: 'DuckDBPyConnection'):
        """Read tables, columns and key constraints from ``con``.

        Raises:
            AssertionError: if a constraint refers to an unknown table or
                column, a constraint index is seen twice, a table has more
                than one primary key, or a foreign key's constraint index
                was not stored while reading primary and unique keys.
        """
        self.tables: List[Table] = []

        tables: Dict[str, Table] = {}
        constraints: Dict[int, Constraint] = {}

        # Get table names first. In the columns table we can not filter
        # for base tables and some of the tables might not be contained
        # in the constraints' information.
        for table_name, in con.execute('''
            SELECT table_name
            FROM information_schema.tables
            WHERE table_type == 'BASE TABLE'
        ''').fetchall():
            table = Table(table_name)

            self.tables.append(table)
            tables[table_name] = table

        # Get column names and data types for each table.
        for table_name, column_name, data_type in con.execute('''
            SELECT
                table_name,
                column_name,
                data_type
            FROM information_schema.columns
            ORDER BY ordinal_position ASC
        ''').fetchall():
            if table_name in tables:
                table = tables[table_name]

                column = Column(table, column_name, data_type)
                table.columns.append(column)

        # Find primary keys.
        for table, constraint in self._load_keys(con, 'PRIMARY KEY', tables, constraints):
            if table.primary_key is not None:
                raise AssertionError(f'discovered second primary key for table {table.name}')

            table.primary_key = constraint

        # Find unique keys.
        for table, constraint in self._load_keys(con, 'UNIQUE', tables, constraints):
            table.unique_keys.append(constraint)

        # Find foreign keys.
        # NOTE(review): the referenced constraint is looked up with the
        # foreign key's own constraint_index - confirm this matches what
        # duckdb_constraints() reports.
        for table_name, constraint_index, constraint_columns in self._constraint_rows(con, 'FOREIGN KEY'):
            table = self._lookup_table(tables, table_name, constraint_index)

            # lookup constraint
            if constraint_index not in constraints:
                raise AssertionError(f'constraint with index {constraint_index} not discovered previously')

            constraint = constraints[constraint_index]

            # store key
            key = ForeignKey(tuple(table.get_column(c) for c in constraint_columns), constraint)
            table.foreign_keys.append(key)

    @staticmethod
    def _constraint_rows(con: 'DuckDBPyConnection', constraint_type: str):
        """Fetch (table name, constraint index, column names) rows of one constraint type."""
        return con.execute('''
            SELECT
                table_name,
                constraint_index,
                constraint_column_names
            FROM duckdb_constraints()
            WHERE constraint_type = ?
            ORDER BY constraint_index ASC
        ''', [constraint_type]).fetchall()

    @staticmethod
    def _lookup_table(tables: Dict[str, 'Table'], table_name: str, constraint_index: int) -> 'Table':
        """Return the known table called ``table_name`` or raise AssertionError."""
        if table_name not in tables:
            raise AssertionError(f'unknown table {table_name} for constraint {constraint_index}')

        return tables[table_name]

    @classmethod
    def _load_keys(cls, con: 'DuckDBPyConnection', constraint_type: str,
                   tables: Dict[str, 'Table'], constraints: Dict[int, 'Constraint']):
        """Create and register a Constraint per row of ``constraint_type``; yield (table, constraint)."""
        for table_name, constraint_index, constraint_columns in cls._constraint_rows(con, constraint_type):
            # get table
            table = cls._lookup_table(tables, table_name, constraint_index)

            # store constraint
            if constraint_index in constraints:
                raise AssertionError(f'constraint with index {constraint_index} already stored')

            constraint = Constraint(
                constraint_index,
                table,
                tuple(table.get_column(c) for c in constraint_columns)
            )
            constraints[constraint_index] = constraint

            yield table, constraint

    def to_graph(self) -> 'Digraph':
        """Build a graph with one plaintext node per table and one edge per foreign key."""
        import html

        # create graph
        ps = Digraph('Schema',
                     graph_attr={},
                     node_attr={
                         'shape': 'plaintext'
                     })

        # add nodes
        fk_counter: Dict[str, int] = {}

        for table in self.tables:
            columns = "\n".join(self.__column_to_html(table, column, fk_counter) for column in table.columns)
            # the label is markup, so the name must not contain raw '<', '>' or '&'
            table_name = html.escape(table.name, quote=False)

            ps.node(
                table.id,
                f'''<
                    <table border="0" cellborder="1" cellspacing="0" cellpadding="5">
                        <tr>
                            <td><b>{table_name}</b></td>
                        </tr>
                        <tr>
                            <td>
                                <table border="0" cellborder="0" cellspacing="0">
                                    {columns}
                                </table>
                            </td>
                        </tr>
                    </table>
                >'''
            )

        # add edges, labelled with the numbers assigned while rendering the columns
        for source_table in self.tables:
            for key in source_table.foreign_keys:
                target_table = key.constraint.table
                fk_counter_key = f'{source_table.name}_{key.constraint.index}'

                ps.edge(source_table.id, target_table.id, label=f'FK{fk_counter[fk_counter_key]}', arrowhead='vee')

        # return graph
        return ps

    def to_svg(self, lr: bool) -> str:
        """Render the graph as SVG text; ``lr`` sets the graph's ``rankdir`` to ``LR``."""
        ps = self.to_graph()
        if lr:
            ps.graph_attr['rankdir'] = 'LR'

        return ps.pipe(format='svg').decode('utf-8')

    @staticmethod
    def __column_to_html(table: 'Table', column: 'Column', fk_counter: Dict[str, int]):
        """Return the table row markup for one column.

        Primary key columns are printed bold, columns of a unique key are
        underlined and foreign key columns get ``(FKn)`` markers.
        ``fk_counter`` maps ``<table name>_<constraint index>`` to the number
        ``n`` and is extended in place for keys seen for the first time.
        """
        import html

        # names and types end up inside markup, so escape them first
        name = html.escape(column.name, quote=False)
        data_type = html.escape(column.data_type, quote=False)

        # extract and style column name
        if table.primary_key is not None and column in table.primary_key.columns:
            name = f'<b>{name}</b>'
        for key in table.unique_keys:
            if column in key.columns:
                name = f'<u>{name}</u>'
                break

        # extract foreign keys
        fk = []
        for key in table.foreign_keys:
            if column in key.columns:
                fk_counter_key = f'{table.name}_{key.constraint.index}'
                if fk_counter_key not in fk_counter:
                    # next free number, starting at 1
                    fk_counter[fk_counter_key] = max(fk_counter.values(), default=0) + 1

                fk.append(fk_counter[fk_counter_key])

        if len(fk) > 0:
            fk = map(lambda x: f'(FK{x})', sorted(fk))
            fk = f'<i>{" ".join(fk)}</i>'
        else:
            fk = ''

        # convert to html
        return f'''
            <tr port="{column.id}">
                <td align="left">{name}</td>
                <td align="left">: {data_type}</td>
                <td align="left">{fk}</td>
            </tr>
        '''
@@ -0,0 +1,5 @@
1
+ from .Column import Column
2
+ from .Constraint import Constraint
3
+ from .ForeignKey import ForeignKey
4
+ from .Table import Table
5
+ from .VizDrawer import VizDrawer
@@ -0,0 +1,202 @@
1
+ Metadata-Version: 2.1
2
+ Name: jupyter-duckdb
3
+ Version: 0.4.1
4
+ Summary: a basic wrapper kernel for DuckDB
5
+ Home-page: https://github.com/erictroebs/jupyter-duckdb
6
+ Author: Eric Tröbs
7
+ Author-email: eric.troebs@tu-ilmenau.de
8
+ Project-URL: Bug Tracker, https://github.com/erictroebs/jupyter-duckdb/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.7
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: jupyter
15
+ Requires-Dist: duckdb ==0.8.1
16
+ Requires-Dist: graphviz ==0.20.1
17
+ Requires-Dist: checkmarkandcross
18
+
19
+ # DuckDB Kernel for Jupyter
20
+
21
+ This is a simple DuckDB wrapper kernel which accepts SQL as input, executes it using a previously loaded DuckDB instance
22
+ and formats the output as a table. There are some magic commands that make teaching easier with this kernel.
23
+
24
+ ## Quick Start
25
+
26
+ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/git/https%3A%2F%2Fdbgit.prakinf.tu-ilmenau.de%2Fertr8623%2Fjupyter-duckdb.git/master)
27
+
28
+ ## Table of Contents
29
+
30
+ - [Setup](#setup)
31
+ - [Using pip](#using-pip)
32
+ - [Using Docker](#using-docker)
33
+ - [Usage](#usage)
34
+ - [A Note on Magic Commands](#a-note-on-magic-commands)
35
+ - [Load a Database](#load-a-database)
36
+ - [Schema Diagrams](#schema-diagrams)
37
+ - [Number of Rows](#number-of-rows)
38
+ - [Ship Tests With Your Notebooks](#ship-tests-with-your-notebooks)
39
+
40
+ ## Setup
41
+
42
+ ### Using pip
43
+
44
+ Run `pip` to install the corresponding package from [pypi](https://pypi.org/project/jupyter-duckdb/) **after**
45
+ Jupyter is already installed.
46
+
47
+ ```bash
48
+ pip install jupyter-duckdb
49
+ ```
50
+
51
+ Register the kernel.
52
+
53
+ ```bash
54
+ jupyter kernelspec install <path to the site-packages directory>/duckdb_kernel
55
+ ```
56
+
57
+ Now start Jupyter the usual way and the kernel should be available.
58
+
59
+ ### Using Docker
60
+
61
+ Execute the following command to pull and run a prepared image.
62
+
63
+ ```bash
64
+ docker run -p 8888:8888 troebs/jupyter:duckdb
65
+ ```
66
+
67
+ This image can also be used with JupyterHub and the
68
+ [DockerSpawner / SwarmSpawner](https://github.com/jupyterhub/dockerspawner)
69
+ and probably with the
70
+ [kubespawner](https://github.com/jupyterhub/kubespawner).
71
+ You can also build your own image using the [Dockerfile](Dockerfile) in the repository.
72
+
73
+ ## Usage
74
+
75
+ A detailed example can be found [in the repository](example/). The rest of this section describes the magic commands.
76
+
77
+ ### A Note on Magic Commands
78
+
79
+ Many Jupyter kernels distinguish between magic commands for a single line starting with one percent sign and
80
+ others for a whole cell starting with two percent signs. The upcoming magic commands always apply to a whole cell.
81
+ Therefore, it does not matter whether you use a single or two percent signs. However, the magic commands must always
82
+ be used at the beginning of a cell.
83
+
84
+ It is also possible to use more than one magic command per cell.
85
+
86
+ ### Load a Database
87
+
88
+ To load the database two magic commands are available.
89
+
90
+ `CREATE` creates a new database and therefore overwrites files with the same name without prompting. Using the optional
91
+ parameter `OF` you can either provide another DuckDB file or a file with SQL statements. In the first case the included
92
+ tables will be copied to the new database, while in the second case the SQL statements are just executed. We find this
93
+ feature very useful to work in a temporary copy of the data and therefore be able to restart at any time. The last
94
+ optional parameter `WITH_TESTS` is described in detail [below](#ship-tests-with-your-notebooks).
95
+
96
+ ```
97
+ %CREATE data.duckdb OF my_statements.sql
98
+ ```
99
+
100
+ `LOAD` on the other hand loads an existing database and returns an error if it does not exist. (That is why `OF` cannot
101
+ be used with `LOAD`! `WITH_TESTS`, on the other hand, is also available with this magic command.)
102
+
103
+ ```
104
+ %LOAD data.duckdb
105
+ ```
106
+
107
+ Only one database can be open at any time. If a new database is created or loaded, the current one is closed first and
108
+ saved to disk if necessary.
109
+
110
+ Please note that `:memory:` is also a valid file path for DuckDB. The data is then stored exclusively in the main
111
+ memory. In combination with `CREATE` and `OF` this makes it possible to work on a temporary copy in memory.
112
+
113
+ ### Schema Diagrams
114
+
115
+ The magic command `SCHEMA` can be used to create a simple schema diagram of the loaded database, showing all created
116
+ tables, their columns and data types, but without any views. Primary keys are printed in bold and unique keys are
117
+ underlined. Foreign keys are also highlighted and the dependencies between the tables are shown by arrows.
118
+
119
+ The optional parameter `LR` can be set to a true value to force a horizontal layout. This saves visual space especially
120
+ for larger numbers of tables.
121
+
122
+ ```
123
+ %SCHEMA LR 1
124
+ ```
125
+
126
+ ### Number of Rows
127
+
128
+ By default, only 20 rows are shown. All further lines are replaced by three dots. When hovering over the three dots
129
+ using the cursor, the number of omitted lines is displayed. Of course, the number of lines displayed can be changed.
130
+
131
+ The magic command `ALL_ROWS` and its short form `ALL` can be used to display **all** rows of the query in the same
132
+ cell. **Caution**: With large result sets this can lead to a frozen Jupyter instance.
133
+
134
+ ```sql
135
+ %ALL_ROWS
136
+ SELECT *
137
+ FROM foo
138
+ -- all rows
139
+ ```
140
+
141
+ The magic command `QUERY_MAX_ROWS` followed by an integer can be used to change the number of displayed rows for the
142
+ current cell.
143
+
144
+ ```sql
145
+ %QUERY_MAX_ROWS 50
146
+ SELECT *
147
+ FROM foo
148
+ -- 50 rows
149
+ ```
150
+
151
+ The magic command `MAX_ROWS` followed by an integer can be used to change the number of displayed rows for all future
152
+ queries including the current cell.
153
+
154
+ ```sql
155
+ %MAX_ROWS 30
156
+ SELECT *
157
+ FROM foo
158
+ -- 30 rows
159
+ ```
160
+
161
+ ```sql
162
+ SELECT *
163
+ FROM bar
164
+ -- 30 rows
165
+ ```
166
+
167
+ ### Ship Tests With Your Notebooks
168
+
169
+ Simple tests can be loaded together with the database with the help of the `WITH_TESTS` parameter. These tests are
170
+ stored as a JSON file. Each test is assigned a unique name, a result set and whether the test should check the order
171
+ of the result. A very simple test file looks like the following JSON object:
172
+
173
+ ```json
174
+ {
175
+ "task1": {
176
+ "ordered": false,
177
+ "equals": [
178
+ [
179
+ 1,
180
+ "Name 1"
181
+ ],
182
+ [
183
+ 2,
184
+ "Name 2"
185
+ ]
186
+ ]
187
+ }
188
+ }
189
+ ```
190
+
191
+ To bind a test to a cell, use the magic command `TEST` in combination with a name. After the cell is executed, the
192
+ result is evaluated and then displayed below the query result.
193
+
194
+ ```sql
195
+ %TEST task1
196
+ SELECT 2, 'Name 2'
197
+ UNION
198
+ SELECT 1, 'Name 1'
199
+ ```
200
+
201
+ Disclaimer: The integrated testing is work-in-progress and thus subject to potentially incompatible changes and
202
+ enhancements.
@@ -0,0 +1,21 @@
1
+ duckdb_kernel/__init__.py,sha256=6auU6zeJrsA4fxPSr2PYamS8fG-SMXTn5YQFXF2cseo,33
2
+ duckdb_kernel/__main__.py,sha256=Z3GwHEBWoQjNm2Y84ijnbA0Lk66L7nsFREuqhZ_ptk0,165
3
+ duckdb_kernel/kernel.json,sha256=_7E8Ci2FSdCvnzCjsOaue8QE8AvpS5JLQuxORO5IGtA,127
4
+ duckdb_kernel/kernel.py,sha256=n83u1M3I2dID_CxZRp9atQq1yk168NwICAJo6nVyRKs,13196
5
+ duckdb_kernel/magics/MagicCommand.py,sha256=d4Chj2G9CfX18Y5ZcH5E_Ovx0fueh-Eq54nLH--cgis,1779
6
+ duckdb_kernel/magics/MagicCommandCallback.py,sha256=sCGsUbQUmUctGpBQRtkca44tYCLI8u4Spo6ntMggmFc,706
7
+ duckdb_kernel/magics/MagicCommandException.py,sha256=MwuWkpA6NoCqz437urdI0RVXhbSbVdziuRoi7slYFPc,49
8
+ duckdb_kernel/magics/MagicCommandHandler.py,sha256=V47ef_nWptg7ClwNPKaEVxjQ5prAcMpCk5jXI29RpPA,2319
9
+ duckdb_kernel/magics/__init__.py,sha256=DA8gnQeRCUt1Scy3_NQ9w5CPmMEY9i8YwB-g392pN1U,204
10
+ duckdb_kernel/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ duckdb_kernel/util/formatting.py,sha256=rxY6rBF-p_mk_HS1Z2PrHelJ-IElxYl6GLaDS9hZJ1U,653
12
+ duckdb_kernel/visualization/Column.py,sha256=UXHxczsT6HalANH0CaklEVCyJZg1l0cmq-KGRWXt2-A,422
13
+ duckdb_kernel/visualization/Constraint.py,sha256=1YgUHk7s8mHCVedbcuJKyXDykj7_ybbwT3Dk9p2VMis,287
14
+ duckdb_kernel/visualization/ForeignKey.py,sha256=iurUAXwTwSIpLXsL0B7BA8jqDTfW4_wkeHxoqQbZwiU,470
15
+ duckdb_kernel/visualization/Table.py,sha256=Jv9un_oX-nupx2EqzJDn_UHtAwddgFGSEapho2kIDrY,756
16
+ duckdb_kernel/visualization/VizDrawer.py,sha256=435Ejrp4nEnlnnL2-cu9IHUmgkhSSQQ-04EJtbm3T8g,7568
17
+ duckdb_kernel/visualization/__init__.py,sha256=BfWfACqoxtagVQxK1eAM2r_VbxDf0psPO_0fQWCiiro,155
18
+ jupyter_duckdb-0.4.1.dist-info/METADATA,sha256=CvKQQeaSEgPzWIoLXv1UkRVmeaAsqY7i-VMrA2GDqoU,6563
19
+ jupyter_duckdb-0.4.1.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
20
+ jupyter_duckdb-0.4.1.dist-info/top_level.txt,sha256=KvRRPMnmkQNuhyBsXoPmwyt26LRDp0O-0HN6u0Dm5jA,14
21
+ jupyter_duckdb-0.4.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.38.4)
2
+ Generator: bdist_wheel (0.41.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,17 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: jupyter-duckdb
3
- Version: 0.3.2
4
- Summary: a basic wrapper kernel for DuckDB
5
- Home-page: https://github.com/erictroebs/jupyter-duckdb
6
- Author: Eric Tröbs
7
- Author-email: eric.troebs@tu-ilmenau.de
8
- Project-URL: Bug Tracker, https://github.com/erictroebs/jupyter-duckdb/issues
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: License :: OSI Approved :: MIT License
11
- Classifier: Operating System :: OS Independent
12
- Requires-Python: >=3.6
13
- Description-Content-Type: text/markdown
14
- Requires-Dist: jupyter
15
- Requires-Dist: duckdb (==0.6.1)
16
-
17
- # DuckDB Kernel for Jupyter
@@ -1,8 +0,0 @@
1
- duckdb_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- duckdb_kernel/__main__.py,sha256=Z3GwHEBWoQjNm2Y84ijnbA0Lk66L7nsFREuqhZ_ptk0,165
3
- duckdb_kernel/kernel.json,sha256=_7E8Ci2FSdCvnzCjsOaue8QE8AvpS5JLQuxORO5IGtA,127
4
- duckdb_kernel/kernel.py,sha256=cYMSgJgcTjPOMvxZuXS9wsXZpP2TsovzTovI2VYOgQY,10762
5
- jupyter_duckdb-0.3.2.dist-info/METADATA,sha256=QQ2rgkWRmphAfVzlIfM-cLyg0s3bkz8e3jOUwmFrxEM,588
6
- jupyter_duckdb-0.3.2.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
7
- jupyter_duckdb-0.3.2.dist-info/top_level.txt,sha256=KvRRPMnmkQNuhyBsXoPmwyt26LRDp0O-0HN6u0Dm5jA,14
8
- jupyter_duckdb-0.3.2.dist-info/RECORD,,