real-ladybug 0.0.1.dev1__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of real-ladybug might be problematic. Click here for more details.

Files changed (114) hide show
  1. real_ladybug/__init__.py +83 -0
  2. real_ladybug/_lbug.cp312-win_amd64.pyd +0 -0
  3. real_ladybug/_lbug.exp +0 -0
  4. real_ladybug/_lbug.lib +0 -0
  5. real_ladybug/async_connection.py +226 -0
  6. real_ladybug/connection.py +323 -0
  7. real_ladybug/constants.py +7 -0
  8. real_ladybug/database.py +307 -0
  9. real_ladybug/prepared_statement.py +51 -0
  10. real_ladybug/py.typed +0 -0
  11. real_ladybug/query_result.py +511 -0
  12. real_ladybug/torch_geometric_feature_store.py +185 -0
  13. real_ladybug/torch_geometric_graph_store.py +131 -0
  14. real_ladybug/torch_geometric_result_converter.py +282 -0
  15. real_ladybug/types.py +39 -0
  16. real_ladybug-0.0.1.dev1.dist-info/METADATA +88 -0
  17. real_ladybug-0.0.1.dev1.dist-info/RECORD +114 -0
  18. real_ladybug-0.0.1.dev1.dist-info/WHEEL +5 -0
  19. real_ladybug-0.0.1.dev1.dist-info/licenses/LICENSE +21 -0
  20. real_ladybug-0.0.1.dev1.dist-info/top_level.txt +3 -0
  21. real_ladybug-0.0.1.dev1.dist-info/zip-safe +1 -0
  22. real_ladybug-source/scripts/antlr4/hash.py +2 -0
  23. real_ladybug-source/scripts/antlr4/keywordhandler.py +47 -0
  24. real_ladybug-source/scripts/collect-extensions.py +68 -0
  25. real_ladybug-source/scripts/collect-single-file-header.py +126 -0
  26. real_ladybug-source/scripts/export-dbs.py +101 -0
  27. real_ladybug-source/scripts/export-import-test.py +345 -0
  28. real_ladybug-source/scripts/extension/purge-beta.py +34 -0
  29. real_ladybug-source/scripts/generate-cpp-docs/collect_files.py +122 -0
  30. real_ladybug-source/scripts/generate-tinysnb.py +34 -0
  31. real_ladybug-source/scripts/get-clangd-diagnostics.py +233 -0
  32. real_ladybug-source/scripts/migrate-lbug-db.py +308 -0
  33. real_ladybug-source/scripts/multiplatform-test-helper/collect-results.py +71 -0
  34. real_ladybug-source/scripts/multiplatform-test-helper/notify-discord.py +68 -0
  35. real_ladybug-source/scripts/pip-package/package_tar.py +90 -0
  36. real_ladybug-source/scripts/pip-package/setup.py +130 -0
  37. real_ladybug-source/scripts/run-clang-format.py +408 -0
  38. real_ladybug-source/scripts/setup-extension-repo.py +67 -0
  39. real_ladybug-source/scripts/test-simsimd-dispatch.py +45 -0
  40. real_ladybug-source/scripts/update-nightly-build-version.py +81 -0
  41. real_ladybug-source/third_party/brotli/scripts/dictionary/step-01-download-rfc.py +16 -0
  42. real_ladybug-source/third_party/brotli/scripts/dictionary/step-02-rfc-to-bin.py +34 -0
  43. real_ladybug-source/third_party/brotli/scripts/dictionary/step-03-validate-bin.py +35 -0
  44. real_ladybug-source/third_party/brotli/scripts/dictionary/step-04-generate-java-literals.py +85 -0
  45. real_ladybug-source/third_party/pybind11/tools/codespell_ignore_lines_from_errors.py +35 -0
  46. real_ladybug-source/third_party/pybind11/tools/libsize.py +36 -0
  47. real_ladybug-source/third_party/pybind11/tools/make_changelog.py +63 -0
  48. real_ladybug-source/tools/python_api/build/real_ladybug/__init__.py +83 -0
  49. real_ladybug-source/tools/python_api/build/real_ladybug/async_connection.py +226 -0
  50. real_ladybug-source/tools/python_api/build/real_ladybug/connection.py +323 -0
  51. real_ladybug-source/tools/python_api/build/real_ladybug/constants.py +7 -0
  52. real_ladybug-source/tools/python_api/build/real_ladybug/database.py +307 -0
  53. real_ladybug-source/tools/python_api/build/real_ladybug/prepared_statement.py +51 -0
  54. real_ladybug-source/tools/python_api/build/real_ladybug/py.typed +0 -0
  55. real_ladybug-source/tools/python_api/build/real_ladybug/query_result.py +511 -0
  56. real_ladybug-source/tools/python_api/build/real_ladybug/torch_geometric_feature_store.py +185 -0
  57. real_ladybug-source/tools/python_api/build/real_ladybug/torch_geometric_graph_store.py +131 -0
  58. real_ladybug-source/tools/python_api/build/real_ladybug/torch_geometric_result_converter.py +282 -0
  59. real_ladybug-source/tools/python_api/build/real_ladybug/types.py +39 -0
  60. real_ladybug-source/tools/python_api/src_py/__init__.py +83 -0
  61. real_ladybug-source/tools/python_api/src_py/async_connection.py +226 -0
  62. real_ladybug-source/tools/python_api/src_py/connection.py +323 -0
  63. real_ladybug-source/tools/python_api/src_py/constants.py +7 -0
  64. real_ladybug-source/tools/python_api/src_py/database.py +307 -0
  65. real_ladybug-source/tools/python_api/src_py/prepared_statement.py +51 -0
  66. real_ladybug-source/tools/python_api/src_py/py.typed +0 -0
  67. real_ladybug-source/tools/python_api/src_py/query_result.py +511 -0
  68. real_ladybug-source/tools/python_api/src_py/torch_geometric_feature_store.py +185 -0
  69. real_ladybug-source/tools/python_api/src_py/torch_geometric_graph_store.py +131 -0
  70. real_ladybug-source/tools/python_api/src_py/torch_geometric_result_converter.py +282 -0
  71. real_ladybug-source/tools/python_api/src_py/types.py +39 -0
  72. real_ladybug-source/tools/python_api/test/conftest.py +230 -0
  73. real_ladybug-source/tools/python_api/test/disabled_test_extension.py +73 -0
  74. real_ladybug-source/tools/python_api/test/ground_truth.py +430 -0
  75. real_ladybug-source/tools/python_api/test/test_arrow.py +694 -0
  76. real_ladybug-source/tools/python_api/test/test_async_connection.py +159 -0
  77. real_ladybug-source/tools/python_api/test/test_blob_parameter.py +145 -0
  78. real_ladybug-source/tools/python_api/test/test_connection.py +49 -0
  79. real_ladybug-source/tools/python_api/test/test_database.py +234 -0
  80. real_ladybug-source/tools/python_api/test/test_datatype.py +372 -0
  81. real_ladybug-source/tools/python_api/test/test_df.py +564 -0
  82. real_ladybug-source/tools/python_api/test/test_dict.py +112 -0
  83. real_ladybug-source/tools/python_api/test/test_exception.py +54 -0
  84. real_ladybug-source/tools/python_api/test/test_fsm.py +227 -0
  85. real_ladybug-source/tools/python_api/test/test_get_header.py +49 -0
  86. real_ladybug-source/tools/python_api/test/test_helper.py +8 -0
  87. real_ladybug-source/tools/python_api/test/test_issue.py +147 -0
  88. real_ladybug-source/tools/python_api/test/test_iteration.py +96 -0
  89. real_ladybug-source/tools/python_api/test/test_networkx.py +437 -0
  90. real_ladybug-source/tools/python_api/test/test_parameter.py +340 -0
  91. real_ladybug-source/tools/python_api/test/test_prepared_statement.py +117 -0
  92. real_ladybug-source/tools/python_api/test/test_query_result.py +54 -0
  93. real_ladybug-source/tools/python_api/test/test_query_result_close.py +44 -0
  94. real_ladybug-source/tools/python_api/test/test_scan_pandas.py +676 -0
  95. real_ladybug-source/tools/python_api/test/test_scan_pandas_pyarrow.py +714 -0
  96. real_ladybug-source/tools/python_api/test/test_scan_polars.py +165 -0
  97. real_ladybug-source/tools/python_api/test/test_scan_pyarrow.py +167 -0
  98. real_ladybug-source/tools/python_api/test/test_timeout.py +11 -0
  99. real_ladybug-source/tools/python_api/test/test_torch_geometric.py +640 -0
  100. real_ladybug-source/tools/python_api/test/test_torch_geometric_remote_backend.py +111 -0
  101. real_ladybug-source/tools/python_api/test/test_udf.py +207 -0
  102. real_ladybug-source/tools/python_api/test/test_version.py +6 -0
  103. real_ladybug-source/tools/python_api/test/test_wal.py +80 -0
  104. real_ladybug-source/tools/python_api/test/type_aliases.py +10 -0
  105. real_ladybug-source/tools/rust_api/update_version.py +47 -0
  106. real_ladybug-source/tools/shell/test/conftest.py +218 -0
  107. real_ladybug-source/tools/shell/test/test_helper.py +60 -0
  108. real_ladybug-source/tools/shell/test/test_shell_basics.py +325 -0
  109. real_ladybug-source/tools/shell/test/test_shell_commands.py +656 -0
  110. real_ladybug-source/tools/shell/test/test_shell_control_edit.py +438 -0
  111. real_ladybug-source/tools/shell/test/test_shell_control_search.py +468 -0
  112. real_ladybug-source/tools/shell/test/test_shell_esc_edit.py +232 -0
  113. real_ladybug-source/tools/shell/test/test_shell_esc_search.py +162 -0
  114. real_ladybug-source/tools/shell/test/test_shell_flags.py +645 -0
@@ -0,0 +1,714 @@
1
+ import math
2
+ import random
3
+ import re
4
+ import struct
5
+ from datetime import datetime, timedelta
6
+ from decimal import Decimal
7
+ from pathlib import Path
8
+
9
+ import real_ladybug as lb
10
+ import pandas as pd
11
+ import pyarrow as pa
12
+ import pytest
13
+ from pandas.arrays import ArrowExtensionArray as arrowtopd
14
+ from type_aliases import ConnDB
15
+
16
+
17
def generate_primitive(dtype):
    """Return a random scalar for *dtype* (a pandas ``"x[pyarrow]"`` dtype string).

    Roughly one call in five returns None so that null handling is exercised.
    Unrecognized dtypes fall through to -1.
    """
    if random.randrange(0, 5) == 0:
        return None
    if dtype.startswith("bool"):
        # was `random.randrange(0, 1) == 1`, which is always False;
        # draw a single random bit so both True and False are generated
        return random.getrandbits(1) == 1
    if dtype.startswith("int32"):
        # randint is inclusive on both ends, so INT32_MAX can actually occur
        # (randrange excluded the upper bound)
        return random.randint(-(2**31), 2**31 - 1)
    if dtype.startswith("int64"):
        return random.randint(-(2**63), 2**63 - 1)
    if dtype.startswith("uint64"):
        return random.randint(0, 2**64 - 1)
    if dtype.startswith("float32"):
        # reinterpret 32 random bits as an IEEE-754 float so every bit
        # pattern (including inf/nan/denormals) can occur
        random_bits = random.getrandbits(32)
        random_bytes = struct.pack("<I", random_bits)
        random_float = struct.unpack("<f", random_bytes)[0]
        return random_float
    return -1
34
+
35
+
36
def generate_primitive_series(scale, dtype):
    """Build a pandas Series of *scale* random values with the given dtype."""
    values = [generate_primitive(dtype) for _ in range(scale)]
    return pd.Series(values, dtype=dtype)
38
+
39
+
40
def generate_primitive_df(scale, names, schema):
    """Build a DataFrame with one random column per (name, dtype) pair."""
    columns = {}
    for name, dtype in zip(names, schema):
        columns[name] = generate_primitive_series(scale, dtype)
    return pd.DataFrame(columns)
42
+
43
+
44
def set_thread_count(conn, cnt):
    """Limit query execution on *conn* to *cnt* threads."""
    statement = f"CALL THREADS={cnt}"
    conn.execute(statement)
46
+
47
+
48
def tables_equal(t1, t2):
    """Return True when both tables match in schema, row count, and data."""
    # cheap shape checks first, then column-by-column comparison
    if t1.schema != t2.schema or t1.num_rows != t2.num_rows:
        return False
    for name in t1.schema.names:
        if t1[name] != t2[name]:
            return False
    return True
59
+
60
+
61
def is_null(val):
    """Return True for values these tests treat as "null".

    Covers Python None, the empty string (this suite's string null marker),
    pandas' NA singleton, and float NaN. Everything else is non-null.
    """
    if val is None:
        return True
    if isinstance(val, str):
        return val == ""
    # compare against the public pd.NA singleton instead of reaching into
    # the private pd._libs.missing.NAType class
    if val is pd.NA:
        return True
    # isinstance also covers float subclasses (e.g. numpy float64)
    if isinstance(val, float):
        return math.isnan(val)
    return False
71
+
72
+
73
def pyarrow_test_helper(establish_connection, n, k):
    """Scan a random pyarrow-backed DataFrame of *n* rows with *k* threads.

    Verifies the scanned result matches the source table both when the
    DataFrame is resolved implicitly by name and when bound as a parameter.
    """
    conn, _ = establish_connection
    names = ["boolcol", "int32col", "int64col", "uint64col", "floatcol"]
    schema = ["bool[pyarrow]", "int32[pyarrow]", "int64[pyarrow]", "uint64[pyarrow]", "float32[pyarrow]"]
    set_thread_count(conn, k)
    # seed depends on (n, k) so each size/thread combination gets fresh data
    random.seed(n * k)
    sort_columns = ["int32col", "int64col", "uint64col", "floatcol"]
    df = generate_primitive_df(n, names, schema).sort_values(by=sort_columns)
    patable = pa.Table.from_pandas(df).select(names)

    def check(got):
        # dump both tables before failing so mismatches are debuggable
        if not tables_equal(patable, got):
            print(patable)
            print("-" * 25)
            print(got)
            print("-" * 25)
            pytest.fail("tables are not equal")

    check(
        conn.execute(
            "LOAD FROM df RETURN boolcol, int32col, int64col, uint64col, floatcol ORDER BY int32col, int64col, uint64col, floatcol"
        ).get_as_arrow(n)
    )
    check(
        conn.execute(
            "LOAD FROM $df RETURN boolcol, int32col, int64col, uint64col, floatcol ORDER BY int32col, int64col, uint64col, floatcol",
            {"df": df},
        ).get_as_arrow(n)
    )
101
+
102
+
103
def test_pyarrow_primitive(conn_db_empty: ConnDB) -> None:
    """Stress-test scanning of primitive pyarrow columns across several
    data sizes and thread counts (comparison done in pyarrow_test_helper)."""
    conn, db = conn_db_empty
    establish_connection = (conn, db)
    # stress tests primitive reading
    sfs = [100, 2048, 4000, 9000, 16000]  # row counts per run
    threads = [1, 2, 5, 10]  # thread counts to exercise
    for sf in sfs:
        for thread in threads:
            pyarrow_test_helper(establish_connection, sf, thread)
112
+
113
+
114
def test_pyarrow_time(conn_db_readonly: ConnDB) -> None:
    """Scan duration, timestamp, and date columns at every arrow unit and
    compare the scanned values against the originals."""
    conn, _ = conn_db_readonly
    # durations of the same instants in seconds / microseconds / nanoseconds
    col1 = pa.array([1000123, 2000123, 3000123], type=pa.duration("s"))
    col2 = pa.array([1000123000000, 2000123000000, 3000123000000], type=pa.duration("us"))
    col3 = pa.array([1000123000000000, 2000123000000000, 3000123000000000], type=pa.duration("ns"))
    # the same three datetimes at each supported timestamp resolution
    col4 = pa.array([datetime(2012, 1, 20), datetime(2000, 12, 2), datetime(1987, 5, 27)], type=pa.timestamp("s"))
    col5 = pa.array([datetime(2012, 1, 20), datetime(2000, 12, 2), datetime(1987, 5, 27)], type=pa.timestamp("s"))
    col6 = pa.array([datetime(2012, 1, 20), datetime(2000, 12, 2), datetime(1987, 5, 27)], type=pa.timestamp("ms"))
    col7 = pa.array([datetime(2012, 1, 20), datetime(2000, 12, 2), datetime(1987, 5, 27)], type=pa.timestamp("us"))
    col8 = pa.array([datetime(2012, 1, 20), datetime(2000, 12, 2), datetime(1987, 5, 27)], type=pa.timestamp("ns"))
    # date-only columns in both 32-bit and 64-bit arrow encodings
    col9 = pa.array([datetime(2012, 1, 20), datetime(2000, 12, 2), datetime(1987, 5, 27)], type=pa.date32())
    col10 = pa.array([datetime(2012, 1, 20), datetime(2000, 12, 2), datetime(1987, 5, 27)], type=pa.date64())
    # not implemented by pandas
    # col11 = pa.array([(1, 2, 3), (4, 5, -6), (100, 200, 1000000000)], type=pa.month_day_nano_interval())
    # for some reason, pyarrow doesnt support the direct creation of pure month or pure datetime
    # intervals, so that will remain untested for now
    df = pd.DataFrame({
        "col1": arrowtopd(col1),
        "col2": arrowtopd(col2),
        "col3": arrowtopd(col3),
        "col4": arrowtopd(col4),
        "col5": arrowtopd(col5),
        "col6": arrowtopd(col6),
        "col7": arrowtopd(col7),
        "col8": arrowtopd(col8),
        "col9": arrowtopd(col9),
        "col10": arrowtopd(col10),
        # 'col11': arrowtopd(col11)
    })
    result = conn.execute("LOAD FROM df RETURN *").get_as_df()
    # durations: normalize both sides to datetime.timedelta before comparing
    for colname in ["col1", "col2", "col3"]:
        for expected, actual in zip(df[colname], result[colname]):
            tmp1 = expected if type(expected) is timedelta else expected.to_pytimedelta()
            tmp2 = actual if type(actual) is timedelta else actual.to_pytimedelta()
            assert tmp1 == tmp2
    # timestamps: normalize both sides to datetime.datetime
    for colname in ["col4", "col5", "col6", "col7", "col8"]:
        for expected, actual in zip(df[colname], result[colname]):
            tmp1 = expected if type(expected) is datetime else expected.to_pydatetime()
            tmp2 = actual if type(actual) is datetime else actual.to_pydatetime()
            assert tmp1 == tmp2
    # dates come back as datetimes at midnight, so lift the expected date
    for colname in ["col9", "col10"]:
        for expected, actual in zip(df[colname], result[colname]):
            assert datetime.combine(expected, datetime.min.time()) == actual.to_pydatetime()
157
+
158
+
159
def generate_blob(length):
    """Return *length* random bytes, or None about one time in six."""
    if random.randint(0, 5) != 0:
        bits = random.getrandbits(8 * length)
        return bits.to_bytes(length, "little")
    return None
163
+
164
+
165
def test_pyarrow_blob(conn_db_readonly: ConnDB) -> None:
    """Scan binary columns (variable, large, fixed-size, and view) and
    compare the scanned bytes against the originals."""
    conn, _ = conn_db_readonly
    # blobs, blob views, and fixed size blobs
    random.seed(100)
    index = pa.array(range(16000), type=pa.int64())
    col1 = pa.array([generate_blob(random.randint(10, 100)) for i in range(16000)], type=pa.binary())
    col2 = pa.array([generate_blob(random.randint(10, 100)) for i in range(16000)], type=pa.large_binary())
    col3 = pa.array([generate_blob(32) for i in range(16000)], type=pa.binary(32))
    # a zero-copy view over col1 to exercise the view code path
    col4 = col1.view(pa.binary())
    df = pd.DataFrame({
        "index": arrowtopd(index),
        "col1": arrowtopd(col1),
        "col2": arrowtopd(col2),
        "col3": arrowtopd(col3),
        "col4": arrowtopd(col4),
    }).sort_values(by=["index"])
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index").get_as_df()
    for colname in ["col1", "col2", "col3", "col4"]:
        for expected, actual in zip(df[colname], result[colname]):
            if is_null(expected) or is_null(actual):
                # nulls must round-trip as nulls on both sides
                assert is_null(expected)
                assert is_null(actual)
            else:
                # dump context before the assert fires so failures are debuggable
                if bytes(expected) != bytes(actual):
                    print(expected)
                    print(actual)
                    print(df[colname])
                    print(result[colname])
                    print(colname)
                assert bytes(expected) == bytes(actual)
195
+
196
+
197
def generate_string(length):
    """Return a random string of *length* characters, or None ~1 time in 6."""
    if random.randint(0, 5) == 0:
        return None
    alphabet = "1234567890-=qwertyuiop[]\\asdfghjkl;'zxcvbnm,./"
    chars = [random.choice(alphabet) for _ in range(length)]
    return "".join(chars)
201
+
202
+
203
def test_pyarrow_string(conn_db_readonly: ConnDB) -> None:
    """Scan string columns (variable, large, and view) and compare the
    scanned text against the originals."""
    conn, _ = conn_db_readonly
    # strings, large strings, and string views
    random.seed(100)
    index = pa.array(range(16000), type=pa.int64())
    col1 = pa.array([generate_string(random.randint(10, 100)) for i in range(16000)], type=pa.string())
    col2 = pa.array([generate_string(random.randint(10, 100)) for i in range(16000)], type=pa.large_string())
    # a zero-copy view over col1 to exercise the view code path
    col3 = col1.view(pa.string())
    df = pd.DataFrame({
        "index": arrowtopd(index),
        "col1": arrowtopd(col1),
        "col2": arrowtopd(col2),
        "col3": arrowtopd(col3),
    }).sort_values(by=["index"])
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index").get_as_df()
    for colname in ["col1", "col2", "col3"]:
        for expected, actual in zip(df[colname], result[colname]):
            if is_null(expected) or is_null(actual):
                # nulls must round-trip as nulls on both sides
                assert is_null(expected)
                assert is_null(actual)
            else:
                assert str(expected) == str(actual)
225
+
226
+
227
def test_pyarrow_dict(conn_db_readonly: ConnDB) -> None:
    """Scan dictionary-encoded int and float columns and compare values."""
    conn, _ = conn_db_readonly
    random.seed(100)
    index = pa.array(range(2000), type=pa.int64())
    # low-cardinality data so dictionary encoding actually deduplicates
    col1 = pa.array([random.randint(0, 1) for i in range(2000)], type=pa.int32()).dictionary_encode()
    col2 = pa.array([random.randint(-20, 20) / 10 for i in range(2000)], type=pa.float64()).dictionary_encode()
    # it seems arrow hasn't implemented dictionary encoding for nested types
    # col3 = pa.array([
    #     [generate_string(random.randint(10, 100)) for x in range(random.randint(10, 100))]
    #     for i in range(3000)
    # ], type=pa.list_(pa.string())).dictionary_encode()
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1), "col2": arrowtopd(col2)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index").get_as_df()
    for colname in ["col1", "col2"]:
        for expected, actual in zip(df[colname], result[colname]):
            assert expected == actual
243
+
244
+
245
def test_pyarrow_dict_offset(conn_db_readonly: ConnDB) -> None:
    """Scan a dictionary array whose dictionary is a non-zero-offset slice,
    checking that slice offsets are honored."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 4000
    index = pa.array(range(datalength), type=pa.int64())
    indices = pa.array([random.randint(0, 2) for _ in range(datalength)])
    dictionary = pa.array([1, 2, 3, 4])
    # slice(1, 3) makes the dictionary [2, 3, 4] with a non-zero offset
    col1 = pa.DictionaryArray.from_arrays(indices, dictionary.slice(1, 3))
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        proc = [idx, col1[idx].as_py()]
        assert proc == nxt
        idx += 1

    # every row must have been produced exactly once
    assert idx == len(index)
264
+
265
+
266
def test_pyarrow_list(conn_db_readonly: ConnDB) -> None:
    """Scan list and list-of-list columns with many nulls and compare
    each row against the source arrays."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 50
    childlength = 5
    index = pa.array(range(datalength))
    # NOTE(review): the `== 0` condition means only ~1 row in 6 holds a list
    # and the rest are null — other tests here use `!= 0` for mostly-non-null
    # data; confirm the inverted density is intentional
    col1 = pa.array([
        [generate_primitive("int32[pyarrow]") for x in range(random.randint(1, childlength))]
        if random.randint(0, 5) == 0
        else None
        for i in range(datalength)
    ])
    col2 = pa.array([
        [
            [generate_primitive("int32[pyarrow]") for x in range(random.randint(1, childlength))]
            for y in range(1, childlength)
        ]
        if random.randint(0, 5) == 0
        else None
        for i in range(datalength)
    ])
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1), "col2": arrowtopd(col2)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        proc = [idx, col1[idx].as_py(), col2[idx].as_py()]
        assert proc == nxt
        idx += 1

    assert idx == len(index)
298
+
299
+
300
def test_pyarrow_list_offset(conn_db_readonly: ConnDB) -> None:
    """Scan a ListArray built over a sliced (offset) values buffer with an
    explicit validity mask, checking offsets are honored."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 50
    childlength = 5
    index = pa.array(range(datalength))
    # two extra values so the slice below starts at a non-zero offset
    values = pa.array([generate_primitive("int32[pyarrow]") for _ in range(datalength * childlength + 2)])
    offsets = pa.array(sorted([random.randint(0, datalength * childlength + 1) for _ in range(datalength + 1)]))
    mask = pa.array([random.choice([True, False]) for _ in range(datalength)])
    col1 = pa.ListArray.from_arrays(values=values.slice(2, datalength * childlength), offsets=offsets, mask=mask)
    df = pd.DataFrame({
        "index": arrowtopd(index),
        "col1": arrowtopd(col1),
    })
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        proc = [idx, col1[idx].as_py()]
        assert proc == nxt
        idx += 1

    assert idx == len(index)
324
+
325
+
326
def test_pyarrow_fixed_list(conn_db_readonly: ConnDB) -> None:
    """Scan FixedSizeListArray columns over every supported child type
    (primitive, date, blob, string, dict, list, fixed list, struct, map),
    all sharing one validity mask, and compare each row."""
    conn, _ = conn_db_readonly
    random.seed(100)
    data_len = 50
    child_len = 3

    # one shared validity mask for all fixed-size list columns
    mask = pa.array([random.choice([True, False]) for _ in range(data_len)])
    index = pa.array(range(data_len))

    # fixed list of primitive
    primitive_values = pa.array([generate_primitive("int32[pyarrow]") for _ in range(data_len * child_len)])
    primitive_col = pa.FixedSizeListArray.from_arrays(primitive_values, list_size=child_len, mask=mask)

    # fixed list of datetime
    datetime_values = pa.array(
        [
            datetime(random.randint(1900, 2023), random.randint(1, 12), random.randint(1, 28))
            for _ in range(data_len * child_len)
        ],
        type=pa.date32(),
    )
    datetime_col = pa.FixedSizeListArray.from_arrays(datetime_values, list_size=child_len, mask=mask)

    # fixed list of blob
    blob_values = pa.array(
        [generate_blob(random.randint(10, 100)) for _ in range(data_len * child_len)], type=pa.binary()
    )
    blob_col = pa.FixedSizeListArray.from_arrays(blob_values, list_size=child_len, mask=mask)

    # fixed list of string
    string_values = pa.array(
        [generate_string(random.randint(10, 100)) for _ in range(data_len * child_len)], type=pa.string()
    )
    string_col = pa.FixedSizeListArray.from_arrays(string_values, list_size=child_len, mask=mask)

    # fixed list of dict
    dict_values = pa.array(
        [random.randint(0, 1) for _ in range(data_len * child_len)], type=pa.int32()
    ).dictionary_encode()
    dict_col = pa.FixedSizeListArray.from_arrays(dict_values, list_size=child_len, mask=mask)

    # fixed list of list
    list_values = pa.array([
        [generate_primitive("int32[pyarrow]") for _ in range(random.randint(1, 5))]
        if random.randint(0, 5) != 0
        else None
        for x in range(data_len * child_len)
    ])
    list_col = pa.FixedSizeListArray.from_arrays(list_values, list_size=child_len, mask=mask)

    # fixed list of fixed list
    fixed_list_col = pa.FixedSizeListArray.from_arrays(primitive_col, list_size=1, mask=mask)

    # fixed list of struct
    struct_plaindata = [
        {
            "a": generate_primitive("int32[pyarrow]"),
            "b": {"c": generate_string(10)} if random.randint(0, 5) != 0 else None,
        }
        if random.randint(0, 5) != 0
        else None
        for _ in range(data_len * child_len)
    ]
    struct_values = pa.array(struct_plaindata, pa.struct([("a", pa.int32()), ("b", pa.struct([("c", pa.string())]))]))
    struct_col = pa.FixedSizeListArray.from_arrays(struct_values, list_size=child_len, mask=mask)

    # fixed list of map
    keySet = range(10)
    valueSet = "abcdefghijklmnopqrstuvwxyz"
    map_values = pa.array(
        [
            {
                str(key): "".join(random.sample(valueSet, random.randint(0, len(valueSet))))
                for key in random.sample(keySet, random.randint(1, len(keySet)))
            }
            if random.randint(0, 5) != 0
            else None
            for i in range(data_len * child_len)
        ],
        type=pa.map_(pa.string(), pa.string()),
    )
    map_col = pa.FixedSizeListArray.from_arrays(map_values, list_size=child_len, mask=mask)

    df = pd.DataFrame({
        "index": arrowtopd(index),
        "primitive_col": arrowtopd(primitive_col),
        "datetime_col": arrowtopd(datetime_col),
        "blob_col": arrowtopd(blob_col),
        "string_col": arrowtopd(string_col),
        "dict_col": arrowtopd(dict_col),
        "list_col": arrowtopd(list_col),
        "fixed_list_col": arrowtopd(fixed_list_col),
        "struct_col": arrowtopd(struct_col),
        "map_col": arrowtopd(map_col),
    })
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")

    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        proc = [
            idx,
            primitive_col[idx].as_py(),
            datetime_col[idx].as_py(),
            blob_col[idx].as_py(),
            string_col[idx].as_py(),
            dict_col[idx].as_py(),
            list_col[idx].as_py(),
            fixed_list_col[idx].as_py(),
            struct_col[idx].as_py(),
            # maps come back as dicts, so convert each child map (as_py
            # yields key/value pair lists) before comparing
            None
            if map_col[idx].as_py() is None
            else [
                None if map_col[idx][i].as_py() is None else dict(map_col[idx][i].as_py()) for i in range(child_len)
            ],
        ]
        assert proc == nxt
        idx += 1

    assert idx == len(index)
447
+
448
+
449
def test_pyarrow_fixed_list_offset(conn_db_readonly: ConnDB) -> None:
    """Scan FixedSizeListArrays built over sliced value buffers and then
    sliced again themselves, checking both offset layers are honored."""
    conn, _ = conn_db_readonly
    random.seed(100)
    data_len = 50
    child_len = 5
    # two extra values so the value buffer can be sliced at a non-zero offset
    values = pa.array([generate_primitive("int32[pyarrow]") for _ in range(data_len * child_len + 2)])
    mask = pa.array([random.choice([True, False]) for _ in range(data_len)])
    index = pa.array(range(data_len))
    col1 = pa.FixedSizeListArray.from_arrays(values.slice(2, data_len * child_len), list_size=child_len)
    col1 = col1.slice(1, 49)
    col2 = pa.FixedSizeListArray.from_arrays(values.slice(1, data_len * child_len), list_size=child_len, mask=mask)
    col2 = col2.slice(1, 49)
    # only 49 rows remain after slicing the list arrays
    df = pd.DataFrame({"index": arrowtopd(index.slice(0, 49)), "col1": arrowtopd(col1), "col2": arrowtopd(col2)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        proc = [idx, col1[idx].as_py(), col2[idx].as_py()]
        assert proc == nxt
        idx += 1

    assert idx == 49
472
+
473
+
474
def test_pyarrow_struct(conn_db_readonly: ConnDB) -> None:
    """Scan a nested struct column (struct containing a struct) with nulls
    at both levels and compare each row."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 4096
    index = pa.array(range(datalength))
    # ~1/6 of rows null, and within non-null rows ~1/6 of "b" fields null
    col1_plaindata = [
        {
            "a": generate_primitive("int32[pyarrow]"),
            "b": {"c": generate_string(10)} if random.randint(0, 5) != 0 else None,
        }
        if random.randint(0, 5) != 0
        else None
        for i in range(datalength)
    ]
    col1 = pa.array(col1_plaindata, pa.struct([("a", pa.int32()), ("b", pa.struct([("c", pa.string())]))]))
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        expected = [idx, col1[idx].as_py()]
        assert expected == nxt
        idx += 1

    assert idx == len(index)
500
+
501
+
502
def test_pyarrow_struct_offset(conn_db_readonly: ConnDB) -> None:
    """Scan a StructArray whose child arrays are slices at different
    offsets, with a validity mask, checking per-child offsets are honored."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 4096
    index = pa.array(range(datalength))
    # children are over-allocated by 1/2/3 so each slice starts at a
    # different non-zero offset
    val1 = pa.array([generate_primitive("int32[pyarrow]") for _ in range(datalength + 1)])
    val2 = pa.array([generate_primitive("bool[pyarrow]") for _ in range(datalength + 2)])
    val3 = pa.array([generate_string(random.randint(5, 10)) for _ in range(datalength + 3)])
    mask = pa.array([random.choice([True, False]) for _ in range(datalength)])
    col1 = pa.StructArray.from_arrays(
        [val1.slice(1, datalength), val2.slice(2, datalength), val3.slice(3, datalength)],
        names=["a", "b", "c"],
        mask=mask,
    )
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        expected = [idx, col1[idx].as_py()]
        assert expected == nxt
        idx += 1

    assert idx == len(index)
527
+
528
+
529
def test_pyarrow_union_sparse(conn_db_readonly: ConnDB) -> None:
    """Scan a sparse union of int32/string/float children (each child a
    slice at a different offset) and compare each row."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 4096
    index = pa.array(range(datalength))
    type_codes = pa.array([random.randint(0, 2) for i in range(datalength)], type=pa.int8())
    # children over-allocated by 1/2/3 so slices start at non-zero offsets
    arr1 = pa.array([generate_primitive("int32[pyarrow]") for i in range(datalength + 1)], type=pa.int32())
    arr2 = pa.array([generate_string(random.randint(1, 10)) for i in range(datalength + 2)])
    arr3 = pa.array([generate_primitive("float32[pyarrow]") for j in range(datalength + 3)])
    col1 = pa.UnionArray.from_sparse(
        type_codes, [arr1.slice(1, datalength), arr2.slice(2, datalength), arr3.slice(3, datalength)]
    )
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        expected = [idx, col1[idx].as_py()]
        # nulls may surface in different representations, so treat any
        # null-vs-null pair as a match
        assert expected == nxt or (is_null(nxt[1]) and is_null(expected[1]))
        idx += 1

    assert idx == len(index)
552
+
553
+
554
def test_pyarrow_union_dense(conn_db_readonly: ConnDB) -> None:
    """Scan a dense union of int32/string/float children with explicit
    per-type offsets and compare each row."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 4096
    index = pa.array(range(datalength))
    type_codes = [random.randint(0, 2) for i in range(datalength)]
    # dense unions need an offset into the chosen child for every row:
    # row i's offset is how many earlier rows already used the same child
    offsets = [0 for _ in range(datalength)]
    cnt = [0, 0, 0]
    for i in range(len(type_codes)):
        offsets[i] = cnt[type_codes[i]]
        cnt[type_codes[i]] += 1
    offsets = pa.array(offsets, type=pa.int32())
    # children over-allocated by 1/2/3 so slices start at non-zero offsets
    arr1 = pa.array([generate_primitive("int32[pyarrow]") for i in range(datalength + 1)], type=pa.int32())
    arr2 = pa.array([generate_string(random.randint(1, 10)) for i in range(datalength + 2)])
    arr3 = pa.array([generate_primitive("float32[pyarrow]") for j in range(datalength + 3)])
    col1 = pa.UnionArray.from_dense(
        pa.array(type_codes, type=pa.int8()),
        offsets,
        [arr1.slice(1, datalength), arr2.slice(2, datalength), arr3.slice(3, datalength)],
    )
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        expected = [idx, col1[idx].as_py()]
        # nulls may surface in different representations, so treat any
        # null-vs-null pair as a match
        assert expected == nxt or (is_null(nxt[1]) and is_null(expected[1]))
        idx += 1

    assert idx == len(index)
585
+
586
+
587
def test_pyarrow_map(conn_db_readonly: ConnDB) -> None:
    """Scan a map<string, string> column with random keys/values and nulls,
    comparing each row as a Python dict."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 4096
    index = pa.array(range(datalength))
    keySet = range(100)
    valueSet = "abcdefghijklmnopqrstuvwxyz"
    # random-size maps; ~1/6 of rows are null
    col1 = pa.array(
        [
            {
                str(key): "".join(random.sample(valueSet, random.randint(0, len(valueSet))))
                for key in random.sample(keySet, random.randint(1, len(keySet)))
            }
            if random.randint(0, 5) != 0
            else None
            for i in range(datalength)
        ],
        type=pa.map_(pa.string(), pa.string()),
    )
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(col1)})
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    idx = 0
    while result.has_next():
        assert idx < len(index)
        nxt = result.get_next()
        # as_py yields key/value pair lists, so convert to dict for comparison
        expected = [idx, None if col1[idx].as_py() is None else dict(col1[idx].as_py())]
        assert expected == nxt
        idx += 1

    assert idx == len(index)
617
+
618
+
619
def test_pyarrow_map_offset(conn_db_readonly: ConnDB) -> None:
    """A sliced map column with a null offset entry round-trips via LOAD FROM."""
    conn, _ = conn_db_readonly
    random.seed(100)
    datalength = 50
    maplength = 5
    entries = datalength * maplength
    index = pa.array(range(datalength))
    offsets = sorted(random.randint(0, entries + 1) for _ in range(datalength + 1))
    # A null offset marks the map at that position as null.
    offsets[25] = None
    offsets = pa.array(offsets, type=pa.int32())
    # Key/value children are one element too long so the slices have offset 1.
    keys = pa.array([random.randint(0, (1 << 31) - 1) for _ in range(entries + 1)])
    values = pa.array([generate_primitive("int64[pyarrow]") for _ in range(entries + 1)])
    col1 = pa.MapArray.from_arrays(offsets, keys.slice(1, entries), values.slice(1, entries))
    # Drop the first two and last zero-padded rows to exercise array offsets.
    col1 = col1.slice(2, 48)
    df = pd.DataFrame({
        "index": arrowtopd(index.slice(0, 48)),
        "col1": arrowtopd(col1),
    })
    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
    row = 0
    while result.has_next():
        assert row < len(index)
        actual = result.get_next()
        value = col1[row].as_py()
        expected = [row, None if value is None else dict(value)]
        assert expected == actual
        row += 1

    assert row == 48
648
+
649
+
650
def test_pyarrow_decimal(conn_db_readwrite: ConnDB) -> None:
    """Decimal128 columns round-trip through node-table storage and get_as_arrow.

    Covers both a small-precision DECIMAL(7, 2) and a max-precision
    DECIMAL(38, 0) column.
    """
    conn, _ = conn_db_readwrite
    datalength = 4
    index = pa.array(range(datalength))
    decimal52 = pa.array(map(Decimal, ["1.2", "2", "0.5", "100"]), type=pa.decimal128(7, 2))
    decimal380 = pa.array(
        map(Decimal, ["2938103", "109283091238", "1028391238012", "1283019283123"]), type=pa.decimal128(38, 0)
    )
    df = pd.DataFrame({"index": arrowtopd(index), "col1": arrowtopd(decimal52), "col2": arrowtopd(decimal380)})
    conn.execute("CREATE NODE TABLE tab(id INT64, col1 DECIMAL(7, 2), col2 DECIMAL(38, 0), primary key(id))")
    conn.execute("LOAD FROM df CREATE (t:tab {id: index, col1: col1, col2: col2})")
    result = conn.execute("MATCH (t:tab) RETURN t.id as index, t.col1 as col1, t.col2 as col2").get_as_arrow()
    expected = pa.Table.from_arrays([index, decimal52, decimal380], names=["index", "col1", "col2"])
    # Removed leftover debug print(result)/print(expected) calls that cluttered
    # test output on every run.
    assert tables_equal(result, expected)
666
+
667
+
668
def test_pyarrow_skip_limit(conn_db_readonly: ConnDB) -> None:
    """SKIP/LIMIT scan options on LOAD FROM select the expected row window.

    Checks a mid-table window, a SKIP past the end (empty result), and a LIMIT
    larger than the table (full result).
    """
    conn, _ = conn_db_readonly
    datalength = 15000
    random.seed(100)
    index = pa.array(range(datalength))
    col0 = pa.array([generate_primitive("int64[pyarrow]") for _ in range(datalength)])
    col1 = pa.array([generate_string(random.randint(1, 100)) for _ in range(datalength)])
    col2 = pa.array([
        [generate_primitive("bool[pyarrow]") for x in range(random.randint(1, 10))] for _ in range(datalength)
    ])
    df = pd.DataFrame({
        "index": arrowtopd(index),
        "col0": arrowtopd(col0),
        "col1": arrowtopd(col1),
        "col2": arrowtopd(col2),
    })

    def assert_columns_equal(result, expected):
        # Compare column-by-column; to_pylist() normalizes chunking differences.
        for col in ("index", "col0", "col1", "col2"):
            assert result[col].to_pylist() == expected[col].to_pylist()

    result = conn.execute("LOAD FROM df (SKIP=5000, LIMIT=5000) RETURN * ORDER BY index").get_as_arrow()
    expected = pa.Table.from_pandas(df).slice(5000, 5000)
    assert_columns_equal(result, expected)

    # skip bounds check
    result = conn.execute("LOAD FROM df (SKIP=500000, LIMIT=5000) RETURN * ORDER BY index").get_as_arrow()
    assert len(result) == 0

    # limit bounds check
    result = conn.execute("LOAD FROM df (SKIP=0, LIMIT=500000) RETURN * ORDER BY index").get_as_arrow()
    expected = pa.Table.from_pandas(df)
    assert_columns_equal(result, expected)
702
+
703
+
704
def test_pyarrow_invalid_skip_limit(conn_db_readonly: ConnDB) -> None:
    """Non-integer SKIP/LIMIT option values raise a Binder exception."""
    conn, _ = conn_db_readonly
    df = pd.DataFrame({"col": arrowtopd(pa.array([1, 2, 3, 4, 5]))})
    skip_msg = re.escape("Binder exception: SKIP Option must be a positive integer literal.")
    with pytest.raises(RuntimeError, match=skip_msg):
        conn.execute("LOAD FROM df (skip='1') RETURN *;")
    limit_msg = re.escape("Binder exception: LIMIT Option must be a positive integer literal.")
    with pytest.raises(RuntimeError, match=limit_msg):
        conn.execute("LOAD FROM df (limit='1') RETURN *;")