kumoai 2.12.0.dev202511111731__cp311-cp311-macosx_11_0_arm64.whl → 2.13.0.dev202512091732__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kumoai/__init__.py +18 -9
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +9 -13
  4. kumoai/connector/utils.py +23 -2
  5. kumoai/experimental/rfm/__init__.py +162 -46
  6. kumoai/experimental/rfm/backend/__init__.py +0 -0
  7. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  8. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +20 -30
  9. kumoai/experimental/rfm/backend/local/sampler.py +242 -0
  10. kumoai/experimental/rfm/backend/local/table.py +109 -0
  11. kumoai/experimental/rfm/backend/snow/__init__.py +35 -0
  12. kumoai/experimental/rfm/backend/snow/table.py +117 -0
  13. kumoai/experimental/rfm/backend/sqlite/__init__.py +30 -0
  14. kumoai/experimental/rfm/backend/sqlite/table.py +101 -0
  15. kumoai/experimental/rfm/base/__init__.py +14 -0
  16. kumoai/experimental/rfm/base/column.py +66 -0
  17. kumoai/experimental/rfm/base/sampler.py +374 -0
  18. kumoai/experimental/rfm/base/source.py +18 -0
  19. kumoai/experimental/rfm/{local_table.py → base/table.py} +139 -139
  20. kumoai/experimental/rfm/{local_graph.py → graph.py} +334 -79
  21. kumoai/experimental/rfm/infer/__init__.py +6 -0
  22. kumoai/experimental/rfm/infer/dtype.py +79 -0
  23. kumoai/experimental/rfm/infer/pkey.py +126 -0
  24. kumoai/experimental/rfm/infer/time_col.py +62 -0
  25. kumoai/experimental/rfm/local_graph_sampler.py +43 -4
  26. kumoai/experimental/rfm/local_pquery_driver.py +1 -1
  27. kumoai/experimental/rfm/pquery/pandas_executor.py +1 -1
  28. kumoai/experimental/rfm/rfm.py +28 -27
  29. kumoai/experimental/rfm/sagemaker.py +138 -0
  30. kumoai/spcs.py +1 -3
  31. kumoai/testing/decorators.py +1 -1
  32. {kumoai-2.12.0.dev202511111731.dist-info → kumoai-2.13.0.dev202512091732.dist-info}/METADATA +12 -2
  33. {kumoai-2.12.0.dev202511111731.dist-info → kumoai-2.13.0.dev202512091732.dist-info}/RECORD +36 -21
  34. kumoai/experimental/rfm/utils.py +0 -344
  35. {kumoai-2.12.0.dev202511111731.dist-info → kumoai-2.13.0.dev202512091732.dist-info}/WHEEL +0 -0
  36. {kumoai-2.12.0.dev202511111731.dist-info → kumoai-2.13.0.dev202512091732.dist-info}/licenses/LICENSE +0 -0
  37. {kumoai-2.12.0.dev202511111731.dist-info → kumoai-2.13.0.dev202512091732.dist-info}/top_level.txt +0 -0
@@ -1,33 +1,48 @@
1
1
  kumoai/_logging.py,sha256=U2_5ROdyk92P4xO4H2WJV8EC7dr6YxmmnM-b7QX9M7I,886
2
2
  kumoai/mixin.py,sha256=MP413xzuCqWhxAPUHmloLA3j4ZyF1tEtfi516b_hOXQ,812
3
- kumoai/_version.py,sha256=EmBJ4U0JvENPiq7lq8M80mpSdMDFEwNkBsjWDdzaLT4,39
4
- kumoai/__init__.py,sha256=LU1zmKYc0KV5hy2VGKUuXgSvbJwj2rSRQ_R_bpHyl1o,10708
3
+ kumoai/_version.py,sha256=bUx8YEaVu-Ejr0CkVpysUTjmuNl9FBowBo1W0BvlWVo,39
4
+ kumoai/__init__.py,sha256=Nn9YH_x9kAeEFn8RWbP95slZow0qFnakPZZ1WADe1hY,10843
5
5
  kumoai/formatting.py,sha256=jA_rLDCGKZI8WWCha-vtuLenVKTZvli99Tqpurz1H84,953
6
6
  kumoai/futures.py,sha256=oJFIfdCM_3nWIqQteBKYMY4fPhoYlYWE_JA2o6tx-ng,3737
7
7
  kumoai/kumolib.cpython-311-darwin.so,sha256=AmB_Fysmud1y7Gm5CuBQ5lWDuSzpxVDV_iTA2cjH1s8,232544
8
8
  kumoai/jobs.py,sha256=NrdLEFNo7oeCYSy-kj2nAvCFrz9BZ_xrhkqHFHk5ksY,2496
9
9
  kumoai/exceptions.py,sha256=b-_sdbAKOg50uaJZ65GmBLdTo4HANdjl8_R0sJpwaN0,833
10
10
  kumoai/databricks.py,sha256=e6E4lOFvZHXFwh4CO1kXU1zzDU3AapLQYMxjiHPC-HQ,476
11
- kumoai/spcs.py,sha256=N4ddeoHAc4I3bKrDitsb91lUx5VKvCyPyMT3zWiuCcY,4275
11
+ kumoai/spcs.py,sha256=N31d7rLa-bgYh8e2J4YzX1ScxGLqiVXrqJnCl1y4Mts,4139
12
12
  kumoai/_singleton.py,sha256=UTwrbDkoZSGB8ZelorvprPDDv9uZkUi1q_SrmsyngpQ,836
13
13
  kumoai/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- kumoai/experimental/rfm/local_graph_sampler.py,sha256=o60_sdMa_fr60DrdmCIaE6lKQAD2msp1t-GGubFNt-o,6738
15
- kumoai/experimental/rfm/local_graph.py,sha256=2iJDlsGVzqCe1bD_puXWlhwGkn7YnQyJ4p4C-fwCZNE,30076
16
- kumoai/experimental/rfm/local_pquery_driver.py,sha256=aO7Jfwx9gxGKYvpqxZx1LLWdI1MhuZQOPtAITxoOQO0,26162
17
- kumoai/experimental/rfm/__init__.py,sha256=ornmi2x947jkQLptMn7ZLvTf2Sw-RMcVW73AnjVsWAo,1709
18
- kumoai/experimental/rfm/utils.py,sha256=3IiBvT_aLBkkcJh3H11_50yt_XlEzHR0cm9Kprrtl8k,11123
19
- kumoai/experimental/rfm/local_table.py,sha256=r8xZ33Mjs6JD8ud6h23tZ99Dag2DvZ4h6tWjmGrKQg4,19605
20
- kumoai/experimental/rfm/rfm.py,sha256=V2NxxhrYi_MqLi_xcZsOYsdciT7V44iS5Fc9Ewq9eiM,48101
21
- kumoai/experimental/rfm/local_graph_store.py,sha256=8BqonuaMftAAsjgZpB369i5AeNd1PkisMbbEqc0cKBo,13847
14
+ kumoai/experimental/rfm/local_graph_sampler.py,sha256=32ZCNirPyCqCD8IccaXmRt0EJk1p54mWXpJ33NotAqE,7883
15
+ kumoai/experimental/rfm/local_pquery_driver.py,sha256=dhOS1L9aboya86EL4AFYc8bQkimbOchSLfe_jn2qGh4,26158
16
+ kumoai/experimental/rfm/graph.py,sha256=76hlQyaEYqBYNIF3jslIqRRuAPNtXvc1kR6InwyHH-M,39751
17
+ kumoai/experimental/rfm/__init__.py,sha256=slliYcrh80xPtQQ_nnsp3ny9IbmHCyirmdZUfKTdME4,6064
18
+ kumoai/experimental/rfm/sagemaker.py,sha256=_hTrFg4qfXe7uzwqSEG_wze-IFkwn7qde9OpUodCpbc,4982
19
+ kumoai/experimental/rfm/rfm.py,sha256=BSgxeM0xW2mt74jq4Ah4hl85RxT6337NoDQP7f7iXvY,47699
22
20
  kumoai/experimental/rfm/authenticate.py,sha256=FiuHMvP7V3zBZUlHMDMbNLhc-UgDZgz4hjVSTuQ7DRw,18888
21
+ kumoai/experimental/rfm/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ kumoai/experimental/rfm/backend/sqlite/__init__.py,sha256=jYmZDNAVsojuPO1Q5idFmG5N0aCB8BDyrpAoS31n9bc,844
23
+ kumoai/experimental/rfm/backend/sqlite/table.py,sha256=kcYpWaZKFez2Tru6Sdz-Ywk8jP8VpLnjmCIQQtRFGnU,3800
24
+ kumoai/experimental/rfm/backend/local/__init__.py,sha256=2s9sSA-E-8pfkkzCH4XPuaSxSznEURMfMgwEIfYYPsg,1014
25
+ kumoai/experimental/rfm/backend/local/table.py,sha256=Ahob9HidpU6z_M41rK5FATa3d7CL2UzZl8pGVyrzLNc,3565
26
+ kumoai/experimental/rfm/backend/local/graph_store.py,sha256=RpfJldemOG-4RzGSIS9EcytHbvC4gYm-Ps3a-4qfptk,13297
27
+ kumoai/experimental/rfm/backend/local/sampler.py,sha256=xJf53jD8kJCQa6gZTmxp9AXsOXwPoQ7G7dAefXGX16E,8459
28
+ kumoai/experimental/rfm/backend/snow/__init__.py,sha256=B-tG-p8WA-mBuwvK1f0S2gdRPEGwApdxlnyeVSnY2xg,927
29
+ kumoai/experimental/rfm/backend/snow/table.py,sha256=sHagXhW7RifzOiB4yjxV_9FtR0KUFVIw1mYwZe4bpMg,4255
23
30
  kumoai/experimental/rfm/pquery/__init__.py,sha256=X0O3EIq5SMfBEE-ii5Cq6iDhR3s3XMXB52Cx5htoePw,152
24
- kumoai/experimental/rfm/pquery/pandas_executor.py,sha256=kiBJq7uVGbasG7TiqsubEl6ey3UYzZiM4bwxILqp_54,18487
31
+ kumoai/experimental/rfm/pquery/pandas_executor.py,sha256=wYI9a3smClR2pQGwsYRdmpOm0PlUsbtyW9wpAVpCEe4,18492
25
32
  kumoai/experimental/rfm/pquery/executor.py,sha256=f7-pJhL0BgFU9E4o4gQpQyArOvyrZtwxFmks34-QOAE,2741
26
33
  kumoai/experimental/rfm/infer/multicategorical.py,sha256=0-cLpDnGryhr76QhZNO-klKokJ6MUSfxXcGdQ61oykY,1102
27
34
  kumoai/experimental/rfm/infer/categorical.py,sha256=VwNaKwKbRYkTxEJ1R6gziffC8dGsEThcDEfbi-KqW5c,853
35
+ kumoai/experimental/rfm/infer/time_col.py,sha256=7R5Itl8RRBOr61qLpRTanIqrUVZFZcAXzDA9lCw4nx4,1820
36
+ kumoai/experimental/rfm/infer/pkey.py,sha256=ubNqW1LIjLKiXbjXELAY3g6n2f3u2Eis_uC2DEiXFiU,4393
28
37
  kumoai/experimental/rfm/infer/id.py,sha256=ZIO0DWIoiEoS_8MVc5lkqBfkTWWQ0yGCgjkwLdaYa_Q,908
29
- kumoai/experimental/rfm/infer/__init__.py,sha256=xQ8_SuejIzXyn2J7bIKX3pXumFtRuEfBtE5oEDUDJjI,293
38
+ kumoai/experimental/rfm/infer/dtype.py,sha256=ZZ6ztqJnTR1CaC2z5Uhf0o0rSdNThnss5tem5JNQkck,2607
39
+ kumoai/experimental/rfm/infer/__init__.py,sha256=krdMFN8iKZlSFOl-M5MW1KuSviQV3H1E18jj2uB8g6Q,469
30
40
  kumoai/experimental/rfm/infer/timestamp.py,sha256=vM9--7eStzaGG13Y-oLYlpNJyhL6f9dp17HDXwtl_DM,1094
41
+ kumoai/experimental/rfm/base/__init__.py,sha256=V2B2TA064nppZ3o6uWAH7EFeKSz-hZKB7_YqV1jJfOI,303
42
+ kumoai/experimental/rfm/base/table.py,sha256=yaY7Auvq2KblXOid3-a_Pw6RgnPK5Y1zGAY2xi1D2gg,19843
43
+ kumoai/experimental/rfm/base/sampler.py,sha256=2FIUIN2fD0RCz-qx1NCuEpt3YRx7nxcQds5lEao4mq4,13433
44
+ kumoai/experimental/rfm/base/source.py,sha256=8_waFQVsctryHkm9BwmFZ9-vw5cXAXfjk7KDmcl_kic,272
45
+ kumoai/experimental/rfm/base/column.py,sha256=izCJmufJcd1RSi-ptFMfrue-JYag38MJxizka7ya0-A,2319
31
46
  kumoai/encoder/__init__.py,sha256=VPGs4miBC_WfwWeOXeHhFomOUocERFavhKf5fqITcds,182
32
47
  kumoai/graph/graph.py,sha256=iyp4klPIMn2ttuEqMJvsrxKb_tmz_DTnvziIhCegduM,38291
33
48
  kumoai/graph/__init__.py,sha256=n8X4X8luox4hPBHTRC9R-3JzvYYMoR8n7lF1H4w4Hzc,228
@@ -57,7 +72,7 @@ kumoai/codegen/handlers/utils.py,sha256=58b2GCgaTBUp2aId7BLMXMV0ENrusbNbfw7mlyXA
57
72
  kumoai/codegen/handlers/connector.py,sha256=afGf_GreyQ9y6qF3QTgSiM416qtUcP298SatNqUFhvQ,3828
58
73
  kumoai/codegen/handlers/table.py,sha256=POHpA-GFYFGTSuerGmtigYablk-Wq1L3EBvsOI-iFMQ,3956
59
74
  kumoai/testing/__init__.py,sha256=goHIIo3JE7uHV7njo4_aTd89mVVR74BEAZ2uyBaOR0w,170
60
- kumoai/testing/decorators.py,sha256=RiFrJcP-ym-mB1BYSGC26bBiryxoR9-GwL1G4EHc2sc,1591
75
+ kumoai/testing/decorators.py,sha256=83tMifuPTpUqX7zHxMttkj1TDdB62EBtAP-Fjj72Zdo,1607
61
76
  kumoai/connector/glue_connector.py,sha256=HivT0QYQ8-XeB4QLgWvghiqXuq7jyBK9G2R1py_NnE4,4697
62
77
  kumoai/connector/databricks_connector.py,sha256=YQy203XHZGzNJ8bPUjUOnrVt2KlpgMdVuTHpc6sVCcs,7574
63
78
  kumoai/connector/snowflake_connector.py,sha256=K0s-H9tW3rve8g2x1PbyxvzSpkROfGQZz-Qa4PoT4UE,9022
@@ -65,7 +80,7 @@ kumoai/connector/bigquery_connector.py,sha256=IkyRqvF8Cg96kApUuuz86eYnl-BqBmDX1f
65
80
  kumoai/connector/source_table.py,sha256=QLT8bEYaxeMwy-b168url0VfnkTrs5K6VKLbxTI4hEY,17539
66
81
  kumoai/connector/__init__.py,sha256=9g6oNJ0qHWFlL5enTSoK4_SSH_5hP74xUDZx-9SggC4,842
67
82
  kumoai/connector/file_upload_connector.py,sha256=swp03HgChOvmNPJetuujBSAqADe7NRmS_T0F3o9it4w,7008
68
- kumoai/connector/utils.py,sha256=PUjunLpfqMZsrPDo2EmnyJRBl_mt-E6ugv2kNkf5Rn8,64011
83
+ kumoai/connector/utils.py,sha256=wlqQxMmPvnFNoCcczGkKYjSu05h8OhWh4fhTzQm_2bQ,64694
69
84
  kumoai/connector/s3_connector.py,sha256=3kbv-h7DwD8O260Q0h1GPm5wwQpLt-Tb3d_CBSaie44,10155
70
85
  kumoai/connector/base.py,sha256=cujXSZF3zAfuxNuEw54DSL1T7XCuR4t0shSMDuPUagQ,5291
71
86
  kumoai/pquery/__init__.py,sha256=uTXr7t1eXcVfM-ETaM_1ImfEqhrmaj8BjiIvy1YZTL8,533
@@ -73,7 +88,7 @@ kumoai/pquery/predictive_query.py,sha256=oUqwdOWLLkPM-G4PhpUk_6mwSJGBtaD3t37Wp5O
73
88
  kumoai/pquery/prediction_table.py,sha256=QPDH22X1UB0NIufY7qGuV2XW7brG3Pv--FbjNezzM2g,10776
74
89
  kumoai/pquery/training_table.py,sha256=elmPDZx11kPiC_dkOhJcBUGtHKgL32GCBvZ9k6U0pMg,15809
75
90
  kumoai/client/pquery.py,sha256=R2hc-M8vPoyIDH0ywLwFVxCznVAqpZz3w2HszjdNW-o,6891
76
- kumoai/client/client.py,sha256=S1OfGDwTzoyf40fhg111xGQGNfEP-OnoXqFV6X9iMEc,8580
91
+ kumoai/client/client.py,sha256=Jda8V9yiu3LbhxlcgRWPeYi7eF6jzCKcq8-B_vEd1ik,8514
77
92
  kumoai/client/graph.py,sha256=zvLEDExLT_RVbUMHqVl0m6tO6s2gXmYSoWmPF6YMlnA,3831
78
93
  kumoai/client/online.py,sha256=pkBBh_DEC3GAnPcNw6bopNRlGe7EUbIFe7_seQqZRaw,2720
79
94
  kumoai/client/source_table.py,sha256=VCsCcM7KYcnjGP7HLTb-AOSEGEVsJTWjk8bMg1JdgPU,2101
@@ -91,8 +106,8 @@ kumoai/trainer/baseline_trainer.py,sha256=LlfViNOmswNv4c6zJJLsyv0pC2mM2WKMGYx06o
91
106
  kumoai/trainer/__init__.py,sha256=zUdFl-f-sBWmm2x8R-rdVzPBeU2FaMzUY5mkcgoTa1k,939
92
107
  kumoai/trainer/online_serving.py,sha256=9cddb5paeZaCgbUeceQdAOxysCtV5XP-KcsgFz_XR5w,9566
93
108
  kumoai/trainer/trainer.py,sha256=hBXO7gwpo3t59zKFTeIkK65B8QRmWCwO33sbDuEAPlY,20133
94
- kumoai-2.12.0.dev202511111731.dist-info/RECORD,,
95
- kumoai-2.12.0.dev202511111731.dist-info/WHEEL,sha256=sunMa2yiYbrNLGeMVDqEA0ayyJbHlex7SCn1TZrEq60,136
96
- kumoai-2.12.0.dev202511111731.dist-info/top_level.txt,sha256=YjU6UcmomoDx30vEXLsOU784ED7VztQOsFApk1SFwvs,7
97
- kumoai-2.12.0.dev202511111731.dist-info/METADATA,sha256=sNoIEIZxJx58O-0mQyfBmpsnrkAzg3ZVQhucsvlDX64,2052
98
- kumoai-2.12.0.dev202511111731.dist-info/licenses/LICENSE,sha256=TbWlyqRmhq9PEzCaTI0H0nWLQCCOywQM8wYH8MbjfLo,1102
109
+ kumoai-2.13.0.dev202512091732.dist-info/RECORD,,
110
+ kumoai-2.13.0.dev202512091732.dist-info/WHEEL,sha256=sunMa2yiYbrNLGeMVDqEA0ayyJbHlex7SCn1TZrEq60,136
111
+ kumoai-2.13.0.dev202512091732.dist-info/top_level.txt,sha256=YjU6UcmomoDx30vEXLsOU784ED7VztQOsFApk1SFwvs,7
112
+ kumoai-2.13.0.dev202512091732.dist-info/METADATA,sha256=vJw5NmUoOgDLJFHGcXgjPq6lYJXtSn8wvhyOKnCsaVU,2510
113
+ kumoai-2.13.0.dev202512091732.dist-info/licenses/LICENSE,sha256=TbWlyqRmhq9PEzCaTI0H0nWLQCCOywQM8wYH8MbjfLo,1102
@@ -1,344 +0,0 @@
1
- import re
2
- import warnings
3
- from typing import Any, Dict, Optional
4
-
5
- import numpy as np
6
- import pandas as pd
7
- import pyarrow as pa
8
- from kumoapi.typing import Dtype, Stype
9
-
10
- from kumoai.experimental.rfm.infer import (
11
- contains_categorical,
12
- contains_id,
13
- contains_multicategorical,
14
- contains_timestamp,
15
- )
16
-
17
- # Mapping from pandas/numpy dtypes to Kumo Dtypes
18
# Lookup table from pandas/numpy/pyarrow dtype objects to Kumo `Dtype`s.
# Entries are grouped by their target `Dtype`; the flattening below keeps the
# groups (and the keys within each group) in declaration order.
PANDAS_TO_DTYPE: Dict[Any, Dtype] = {
    source_dtype: kumo_dtype
    for kumo_dtype, source_dtypes in (
        (Dtype.bool, (
            np.dtype('bool'),
            pd.BooleanDtype(),
            pa.bool_(),
        )),
        (Dtype.int, (
            np.dtype('byte'),
            pd.UInt8Dtype(),
            np.dtype('int16'),
            pd.Int16Dtype(),
            np.dtype('int32'),
            pd.Int32Dtype(),
            np.dtype('int64'),
            pd.Int64Dtype(),
        )),
        (Dtype.float, (
            np.dtype('float32'),
            pd.Float32Dtype(),
            np.dtype('float64'),
            pd.Float64Dtype(),
        )),
        (Dtype.string, (
            np.dtype('object'),
            pd.StringDtype(storage='python'),
            pd.StringDtype(storage='pyarrow'),
            pa.string(),
        )),
        (Dtype.binary, (pa.binary(), )),
        (Dtype.date, (np.dtype('datetime64[ns]'), )),
        (Dtype.timedelta, (np.dtype('timedelta64[ns]'), )),
        (Dtype.floatlist, (pa.list_(pa.float32()), )),
        (Dtype.intlist, (pa.list_(pa.int64()), )),
        (Dtype.stringlist, (pa.list_(pa.string()), )),
    )
    for source_dtype in source_dtypes
}
45
-
46
-
47
def to_dtype(ser: pd.Series) -> Dtype:
    """Extracts the :class:`Dtype` from a :class:`pandas.Series`.

    Datetime-like series map to ``Dtype.date`` and categorical series to
    ``Dtype.string``. Object series whose first non-null value (within the
    first 1000 rows) is list-like are classified as one of the list dtypes
    based on a sample of up to 1000 rows. Everything else is resolved via
    ``PANDAS_TO_DTYPE``.

    Args:
        ser: A :class:`pandas.Series` to analyze.

    Returns:
        The data type.

    Raises:
        ValueError: If the sampled rows mix list-like and non-list-like
            values, if a list holds unsupported element types, or if the
            series dtype has no entry in ``PANDAS_TO_DTYPE``.
    """
    if pd.api.types.is_datetime64_any_dtype(ser.dtype):
        return Dtype.date

    if isinstance(ser.dtype, pd.CategoricalDtype):
        return Dtype.string

    if pd.api.types.is_object_dtype(ser.dtype):
        # Locate the first non-null value by *position*. A label-based lookup
        # (`ser[label]` / `Index.get_loc`) returns a Series/slice/mask when
        # index labels are duplicated, which made plain string columns look
        # list-like and tripped an assertion.
        valid_mask = ser.iloc[:1000].notna().to_numpy()
        if valid_mask.any():
            pos = int(valid_mask.argmax())
            if pd.api.types.is_list_like(ser.iloc[pos]):
                return _to_list_dtype(ser.iloc[pos:pos + 1000].dropna())

    if ser.dtype not in PANDAS_TO_DTYPE:
        raise ValueError(f"Unsupported data type '{ser.dtype}'")

    return PANDAS_TO_DTYPE[ser.dtype]


def _to_list_dtype(ser: pd.Series) -> Dtype:
    """Classifies a null-free sample of list-like values as a list dtype."""
    if not ser.map(pd.api.types.is_list_like).all():
        raise ValueError("Data contains a mix of list-like and "
                         "non-list-like values")

    # Empty lists carry no element type information, so skip them:
    ser = ser[ser.map(lambda x: not isinstance(x, list) or len(x) > 0)]

    # Element dtypes unknown to the lookup table are treated as strings:
    dtypes = ser.apply(lambda x: PANDAS_TO_DTYPE.get(
        np.array(x).dtype, Dtype.string)).unique().tolist()

    invalid_dtypes = set(dtypes) - {
        Dtype.string,
        Dtype.int,
        Dtype.float,
    }
    if len(invalid_dtypes) > 0:
        raise ValueError(f"Data contains unsupported list data types: "
                         f"{list(invalid_dtypes)}")

    if Dtype.string in dtypes:
        return Dtype.stringlist

    if dtypes == [Dtype.int]:
        return Dtype.intlist

    return Dtype.floatlist
99
-
100
-
101
def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
    r"""Infers the semantic type of a column.

    The detectors are consulted in a fixed priority order (ID, timestamp,
    multi-categorical, categorical); the first one that matches wins. If none
    matches, the default semantic type of ``dtype`` is used.

    Args:
        ser: A :class:`pandas.Series` to analyze.
        column_name: The name of the column (used for pattern matching).
        dtype: The data type.

    Returns:
        The semantic type.
    """
    prioritized_detectors = (
        (contains_id, Stype.ID),
        (contains_timestamp, Stype.timestamp),
        (contains_multicategorical, Stype.multicategorical),
        (contains_categorical, Stype.categorical),
    )
    for detector, stype in prioritized_detectors:
        if detector(ser, column_name, dtype):
            return stype

    return dtype.default_stype
125
-
126
-
127
def _table_name_variants(table_name: str) -> set[str]:
    """Returns lower-cased spellings of a table name a key column may use.

    Covers, e.g.:
    - UserInfo -> User
    - snakecase <-> camelcase
    - plural <-> singular (users -> user, eligibilities -> eligibility)
    - verb -> noun (qualifying -> qualify)
    """
    base_names = {table_name}
    if table_name.lower().endswith('_info'):
        base_names.add(table_name[:-5])
    elif table_name.lower().endswith('info'):
        base_names.add(table_name[:-4])

    variants: set[str] = set()
    for base_name in base_names:
        snakecase = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', base_name)
        snakecase = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', snakecase)
        camelcase = base_name.replace('_', '')

        base_lower = base_name.lower()
        spellings = (base_lower, snakecase.lower(), camelcase.lower())
        variants.update(spellings)

        # The suffix checks look at the base name only, but every spelling
        # receives the corresponding transformation:
        if base_lower.endswith('s'):
            variants.update(name[:-1] for name in spellings)
        else:
            variants.update(name + 's' for name in spellings)
        if base_lower.endswith('ies'):
            variants.update(name[:-3] + 'y' for name in spellings)
        elif base_lower.endswith('y'):
            variants.update(name[:-1] + 'ies' for name in spellings)
        if base_lower.endswith('ing'):
            variants.update(name[:-3] for name in spellings)

    return variants


def detect_primary_key(
    table_name: str,
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detect potential primary key column.

    Each candidate is scored by how well its name matches the table name
    (or a generic ``id`` column) plus up to 3 points for the fraction of
    unique values among its first 1,000,000 rows. Candidates scoring below 4
    are discarded.

    Args:
        table_name: The table name.
        df: The pandas DataFrame to analyze
        candidates: A list of potential candidates.

    Returns:
        The name of the detected primary key, or ``None`` if not found. If
        several candidates share the best score, a warning listing them is
        emitted and ``None`` is returned.
    """
    table_names = _table_name_variants(table_name)

    scores: list[tuple[str, float]] = []
    for col_name in candidates:
        col_name_lower = col_name.lower()

        score = 0.0

        if col_name_lower == 'id':
            score += 4

        for table_name_lower in table_names:

            if col_name_lower == table_name_lower:
                score += 4  # USER -> USER
                break

            for suffix in ['id', 'hash', 'key', 'code', 'uuid']:
                if not col_name_lower.endswith(suffix):
                    continue

                if col_name_lower == f'{table_name_lower}_{suffix}':
                    score += 5  # USER -> USER_ID
                    break

                if col_name_lower == f'{table_name_lower}{suffix}':
                    score += 5  # User -> UserId
                    break

                if col_name_lower.endswith(f'{table_name_lower}_{suffix}'):
                    score += 2

                if col_name_lower.endswith(f'{table_name_lower}{suffix}'):
                    score += 2

        # `rel-bench` hard-coding :(
        if table_name == 'studies' and col_name == 'nct_id':
            score += 1

        ser = df[col_name].iloc[:1_000_000]
        # An empty table offers no uniqueness evidence (and would otherwise
        # divide by zero), so only the name-based score counts:
        if len(ser) > 0:
            score += 3 * (ser.nunique() / len(ser))

        scores.append((col_name, score))

    scores = [x for x in scores if x[1] >= 4]
    scores.sort(key=lambda x: x[1], reverse=True)

    if len(scores) == 0:
        return None

    # Only return a candidate if its (best) score is unique:
    if len(scores) == 1 or scores[0][1] != scores[1][1]:
        return scores[0][0]

    # `scores` is sorted in descending order, so the head holds the best
    # score; collect every column tied with it for the warning message:
    max_score = scores[0][1]
    tied = [col_name for col_name, score in scores if score == max_score]
    warnings.warn(f"Found multiple potential primary keys in table "
                  f"'{table_name}': {tied}. Please specify the primary "
                  f"key for this table manually.")

    return None
246
-
247
-
248
def detect_time_column(
    df: pd.DataFrame,
    candidates: list[str],
) -> Optional[str]:
    r"""Auto-detect potential time column.

    Columns named ``*last*`` are never considered. A single remaining
    candidate, or a single ``create*`` column, is returned directly.
    Otherwise, the candidate with the oldest timestamp among its first 10,000
    rows is picked.

    Args:
        df: The pandas DataFrame to analyze
        candidates: A list of potential candidates.

    Returns:
        The name of the detected time column, or ``None`` if not found.
    """
    last_pattern = re.compile(r'(^|_)last(_|$)', re.IGNORECASE)
    eligible = [
        name for name in candidates if last_pattern.search(name) is None
    ]

    if not eligible:
        return None

    if len(eligible) == 1:
        return eligible[0]

    # A dedicated `create*` column takes precedence over everything else:
    created = [name for name in eligible if name.lower().startswith('create')]
    if len(created) == 1:
        return created[0]
    if created:
        eligible = created

    # Parse a sample of every remaining candidate, silencing the noisy
    # format-inference warning of pandas:
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='Could not infer format')
        parsed = {
            name: pd.to_datetime(df[name].iloc[:10_000], errors='coerce')
            for name in eligible
        }

    # Keep the oldest (timezone-stripped) timestamp per candidate and drop
    # candidates without any valid timestamp:
    oldest_by_name = {}
    for name, timestamps in parsed.items():
        oldest = timestamps.min().tz_localize(None)
        if not pd.isna(oldest):
            oldest_by_name[name] = oldest

    if not oldest_by_name:
        return None

    return min(oldest_by_name, key=oldest_by_name.get)  # type: ignore
303
-
304
-
305
PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
MULTISPACE = re.compile(r"\s+")


def normalize_text(
    ser: pd.Series,
    max_words: Optional[int] = 50,
) -> pd.Series:
    r"""Normalizes text into a list of lower-case words.

    Empty series and series whose first value is already list-like are
    returned unchanged. Missing values become empty word lists.

    Args:
        ser: The :class:`pandas.Series` to normalize.
        max_words: The maximum number of words to return.
            This will auto-shrink any large text column to avoid blowing up
            context size.

    Returns:
        A :class:`pandas.Series` holding one list of words per row.
    """
    if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
        return ser

    def split_words(text: str) -> list[str]:
        # Blank out punctuation, `<br />`/`<br>` tags and whitespace runs (in
        # that order) before splitting on whitespace:
        for pattern in (PUNCTUATION, r"<br\s*/?>", MULTISPACE):
            text = re.sub(pattern, " ", text)
        tokens = text.split()
        return tokens if max_words is None else tokens[:max_words]

    texts = ser.fillna('').astype(str)

    if max_words is not None:
        # Word splitting on a giant text can be very expensive, so cut the
        # text first, estimating an English word as 5 characters + 1 space:
        texts = texts.str[:6 * max_words]

    return texts.str.lower().map(split_words)