openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,191 @@
1
+ """UX helpers: bookmark, history search, timer, multiline."""
2
+
3
+ from __future__ import annotations
4
+ import time
5
+ from openstat.commands.base import command, CommandArgs, friendly_error
6
+ from openstat.session import Session
7
+
8
+
9
+ # ── Bookmark ─────────────────────────────────────────────────────────────────
10
+
11
@command("bookmark", usage="bookmark save|load|list|rm [name] [command]")
def cmd_bookmark(session: Session, args: str) -> str:
    """Save and recall frequently used commands as bookmarks.

    Sub-commands:
      bookmark save <name> <command> — save a command as bookmark
      bookmark load <name> — execute a bookmarked command
      bookmark list — list all bookmarks
      bookmark rm <name> — remove a bookmark

    Examples:
      bookmark save myols "ols income educ age"
      bookmark load myols
      bookmark list
      bookmark rm myols
    """
    # Bookmarks are stored on the session object; the attribute is created
    # lazily the first time this command runs.
    if not hasattr(session, "_bookmarks"):
        session._bookmarks = {}  # type: ignore[attr-defined]
    bm: dict = session._bookmarks  # type: ignore[attr-defined]

    # maxsplit=2 keeps the command text (third token) in one piece.
    tokens = args.strip().split(None, 2)
    if not tokens:
        return "Usage: bookmark save|load|list|rm [name] [command]"

    subcmd = tokens[0].lower()

    if subcmd == "list":
        if not bm:
            return "No bookmarks saved. Use: bookmark save <name> <command>"
        lines = ["Bookmarks:"]
        lines.extend(f" {name:<20} {cmd}" for name, cmd in bm.items())
        return "\n".join(lines)

    if subcmd == "rm":
        # A missing name used to yield "Bookmark '' not found."; show usage.
        if len(tokens) < 2:
            return "Usage: bookmark rm <name>"
        name = tokens[1]
        if name in bm:
            del bm[name]
            return f"Bookmark '{name}' removed."
        return f"Bookmark '{name}' not found."

    if subcmd == "save":
        if len(tokens) < 3:
            return "Usage: bookmark save <name> <command>"
        name = tokens[1]
        cmd_str = tokens[2]
        # Remove exactly one pair of matching surrounding quotes. Stripping
        # every quote character from both ends would damage commands that
        # legitimately end in a quote, e.g. filter name=="a".
        if len(cmd_str) >= 2 and cmd_str[0] == cmd_str[-1] and cmd_str[0] in "\"'":
            cmd_str = cmd_str[1:-1]
        if not cmd_str.strip():
            return "Usage: bookmark save <name> <command>"
        bm[name] = cmd_str
        return f"Bookmark '{name}' saved: {cmd_str}"

    if subcmd == "load":
        if len(tokens) < 2:
            return "Usage: bookmark load <name>"
        name = tokens[1]
        if name not in bm:
            return f"Bookmark '{name}' not found. Use 'bookmark list' to see saved."
        # Imported here, as in the original, rather than at module level.
        from openstat.commands.base import run_command
        return run_command(session, bm[name])

    return f"Unknown sub-command: {subcmd}"
69
+
70
+
71
+ # ── History search ────────────────────────────────────────────────────────────
72
+
73
@command("history search", usage="history search <pattern> [--n=20]")
def cmd_history_search(session: Session, args: str) -> str:
    """Search command history for matching entries.

    Options:
      --n=<k> maximum results to show (default: 20)

    Examples:
      history search ols
      history search "export" --n=10
      history search plot
    """
    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: history search <pattern>"

    pattern = " ".join(ca.positional).lower()

    # Validate --n explicitly: a non-integer would raise, and 0 would turn
    # the tail slice below into [-0:], which returns *every* match.
    try:
        top_n = int(ca.options.get("n", 20))
    except (TypeError, ValueError):
        return "Invalid --n value: expected a positive integer."
    if top_n < 1:
        return "Invalid --n value: expected a positive integer."

    history = getattr(session, "history", [])
    if not history:
        return "History is empty."

    # Case-insensitive substring match; keep the 1-based history position.
    matches = [
        (pos, cmd)
        for pos, cmd in enumerate(history, 1)
        if pattern in cmd.lower()
    ]

    if not matches:
        return f"No history entries matching: {pattern}"

    lines = [f"History search: '{pattern}' ({len(matches)} matches)"]
    # Only the most recent top_n matches are listed.
    lines.extend(f" {idx:>5} {cmd}" for idx, cmd in matches[-top_n:])
    return "\n".join(lines)
109
+
110
+
111
@command("history show", usage="history show [--n=20]")
def cmd_history_show(session: Session, args: str) -> str:
    """Show recent command history.

    Options:
      --n=<k> number of recent commands (default: 20)

    Examples:
      history show
      history show --n=50
    """
    ca = CommandArgs(args)

    # Validate --n explicitly: a non-integer would raise, and 0 would turn
    # history[-top_n:] into history[-0:], i.e. the entire history.
    try:
        top_n = int(ca.options.get("n", 20))
    except (TypeError, ValueError):
        return "Invalid --n value: expected a positive integer."
    if top_n < 1:
        return "Invalid --n value: expected a positive integer."

    history = getattr(session, "history", [])
    if not history:
        return "History is empty."

    recent = history[-top_n:]
    # Number entries by their absolute (1-based) position in the history.
    offset = len(history) - len(recent)
    lines = [f"Recent history ({len(recent)} of {len(history)} entries):"]
    lines.extend(f" {i:>5} {cmd}" for i, cmd in enumerate(recent, offset + 1))
    return "\n".join(lines)
133
+
134
+
135
+ # ── Timer ────────────────────────────────────────────────────────────────────
136
+
137
@command("timer", usage="timer <command with args>")
def cmd_timer(session: Session, args: str) -> str:
    """Time the execution of a command.

    Runs the specified command and reports wall-clock time.

    Examples:
      timer ols income educ age
      timer bootstrap ols income educ age --reps=500
      timer describe
    """
    target = args.strip()
    if not target:
        return "Usage: timer <command> [args]"

    # Import before starting the clock so import cost is not measured.
    from openstat.commands.base import run_command

    t0 = time.perf_counter()
    output = run_command(session, target)
    elapsed = time.perf_counter() - t0

    # Pick the unit by magnitude: minutes+seconds, seconds, or milliseconds.
    if elapsed >= 60:
        minutes, seconds = divmod(elapsed, 60)
        time_str = f"{int(minutes)}m {seconds:.1f}s"
    elif elapsed >= 1:
        time_str = f"{elapsed:.3f} s"
    else:
        time_str = f"{elapsed * 1000:.1f} ms"

    # The echoed command is truncated to 60 characters in the footer.
    footer = f"Elapsed: {time_str} (command: {target[:60]})"
    return "\n".join([output, "", "-" * 50, footer])
168
+
169
+
170
+ # ── Multiline input helper ────────────────────────────────────────────────────
171
+
172
@command("multiline", usage="multiline")
def cmd_multiline(session: Session, args: str) -> str:
    """Show instructions for multiline input in the REPL.

    In the OpenStat REPL, you can use backslash continuation:
      ols income educ age \\
      --robust \\
      --cluster=state

    Or use a semicolon to chain commands on one line:
      describe; summarize income educ

    Or write commands to a .ost script file and run:
      run my_analysis.ost

    For interactive multiline, use the pipeline command:
      pipeline define myflow "ols y x" | "plot coef" | "export pdf"
      pipeline run myflow
    """
    # The docstring above is returned as the command's output, so its text is
    # user-facing and must not be edited as if it were ordinary documentation.
    # `session` and `args` are unused. The fallback string is returned only
    # when __doc__ is falsy (e.g. docstrings stripped by `python -OO`).
    # NOTE(review): this reads __doc__ from whatever @command returns —
    # presumably the function itself or a wrapper preserving __doc__; confirm.
    return cmd_multiline.__doc__ or "See: pipeline define / run"
@@ -0,0 +1,270 @@
1
+ """Data validation commands: validate, fuzzyjoin."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re as _re
6
+
7
+ from openstat.commands.base import command, CommandArgs, friendly_error
8
+ from openstat.session import Session
9
+
10
+
11
+ # ── validate ─────────────────────────────────────────────────────────────────
12
+
13
@command("validate", usage="validate <col> <rule> [<rule> ...]")
def cmd_validate(session: Session, args: str) -> str:
    """Validate a column against one or more rules.

    Rules:
      min=<N> — all values ≥ N
      max=<N> — all values ≤ N
      notnull — no missing values
      unique — all values distinct
      positive — all values > 0
      nonneg — all values ≥ 0
      regex=<pattern> — all values match regex (string columns)
      oneof=a,b,c — all values in the allowed set
      between=lo,hi — all values in [lo, hi]

    Examples:
      validate age min=0 max=120 notnull
      validate gender oneof=male,female,other
      validate email regex=^[^@]+@[^@]+\\.[^@]+$
      validate score between=0,100
    """
    import polars as pl

    ca = CommandArgs(args)
    if len(ca.positional) < 2:
        return "Usage: validate <col> <rule> [<rule> ...]"

    col_name = ca.positional[0]
    rules = ca.positional[1:]

    try:
        df = session.require_data()
        if col_name not in df.columns:
            return f"Column not found: {col_name}"

        col = df[col_name]
        n_total = df.height
        # Every rule except `notnull` ignores missing values, so drop them
        # once instead of recomputing the filtered series per rule.
        non_null = col.drop_nulls()
        failures: list[str] = []
        passes: list[str] = []

        NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                   pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)
        is_numeric = col.dtype in NUMERIC

        for rule in rules:
            rule = rule.strip()

            if rule == "notnull":
                n_miss = col.null_count()
                if n_miss > 0:
                    failures.append(f"notnull: {n_miss} missing values")
                else:
                    passes.append("notnull ✓")

            elif rule == "unique":
                # Compare against the non-null count: measuring against the
                # full row count would report every null as a duplicate.
                n_dup = non_null.len() - non_null.n_unique()
                if n_dup > 0:
                    failures.append(f"unique: {n_dup} duplicate values")
                else:
                    passes.append("unique ✓")

            elif rule == "positive":
                if not is_numeric:
                    failures.append("positive: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null <= 0).len()
                    if n_bad:
                        failures.append(f"positive: {n_bad} values ≤ 0")
                    else:
                        passes.append("positive ✓")

            elif rule == "nonneg":
                if not is_numeric:
                    failures.append("nonneg: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null < 0).len()
                    if n_bad:
                        failures.append(f"nonneg: {n_bad} negative values")
                    else:
                        passes.append("nonneg ✓")

            elif rule.startswith("min="):
                # A malformed bound fails this rule only; it must not abort
                # the report for the remaining rules.
                try:
                    val = float(rule[4:])
                except ValueError:
                    failures.append(f"min: invalid number '{rule[4:]}'")
                    continue
                if not is_numeric:
                    failures.append(f"min={val}: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null < val).len()
                    if n_bad:
                        failures.append(f"min={val}: {n_bad} values below minimum")
                    else:
                        passes.append(f"min={val} ✓")

            elif rule.startswith("max="):
                try:
                    val = float(rule[4:])
                except ValueError:
                    failures.append(f"max: invalid number '{rule[4:]}'")
                    continue
                if not is_numeric:
                    failures.append(f"max={val}: column is not numeric")
                else:
                    n_bad = non_null.filter(non_null > val).len()
                    if n_bad:
                        failures.append(f"max={val}: {n_bad} values above maximum")
                    else:
                        passes.append(f"max={val} ✓")

            elif rule.startswith("between="):
                parts = rule[8:].split(",")
                if len(parts) != 2:
                    failures.append("between: invalid format (use between=lo,hi)")
                    continue
                try:
                    lo, hi = float(parts[0]), float(parts[1])
                except ValueError:
                    failures.append("between: invalid format (use between=lo,hi)")
                    continue
                if not is_numeric:
                    failures.append(f"between={lo},{hi}: column is not numeric")
                else:
                    n_bad = non_null.filter((non_null < lo) | (non_null > hi)).len()
                    if n_bad:
                        failures.append(f"between={lo},{hi}: {n_bad} values out of range")
                    else:
                        passes.append(f"between={lo},{hi} ✓")

            elif rule.startswith("oneof="):
                # Values are compared as strings against the allowed set.
                allowed = set(rule[6:].split(","))
                vals = non_null.cast(pl.Utf8).to_list()
                bad = [v for v in vals if v not in allowed]
                if bad:
                    sample = bad[:5]
                    failures.append(f"oneof: {len(bad)} invalid values, e.g. {sample}")
                else:
                    passes.append(f"oneof={rule[6:]} ✓")

            elif rule.startswith("regex="):
                pattern = rule[6:]
                try:
                    compiled = _re.compile(pattern)
                except _re.error as exc:
                    failures.append(f"regex: invalid pattern — {exc}")
                    continue
                vals = non_null.cast(pl.Utf8).to_list()
                # `search` matches anywhere in the value; anchor the pattern
                # with ^...$ to require a full match.
                bad = [v for v in vals if not compiled.search(v)]
                if bad:
                    sample = bad[:3]
                    failures.append(f"regex: {len(bad)} non-matching values, e.g. {sample}")
                else:
                    passes.append("regex ✓")

            else:
                failures.append(f"Unknown rule: {rule}")

        # Report: passes first, then failures, then a one-line verdict.
        lines = [f"Validation: {col_name} (N={n_total})", "=" * 50]
        lines.extend(f" PASS {msg}" for msg in passes)
        lines.extend(f" FAIL {msg}" for msg in failures)
        lines.append("=" * 50)
        if failures:
            lines.append(f"Result: {len(failures)} check(s) FAILED, {len(passes)} passed")
        else:
            lines.append(f"Result: All {len(passes)} check(s) PASSED")
        return "\n".join(lines)

    except Exception as e:
        # Command boundary: turn any unexpected error into a user message.
        return friendly_error(e, "validate")
173
+
174
+
175
+ # ── fuzzyjoin ────────────────────────────────────────────────────────────────
176
+
177
@command("fuzzyjoin", usage="fuzzyjoin <other_file> on(<col>) [--threshold=80] [--method=ratio]")
def cmd_fuzzyjoin(session: Session, args: str) -> str:
    """Fuzzy (approximate) string join with another dataset.

    Matches rows by similarity of a string column, useful for messy data.

    Options:
      on(<col>) — column to match on (must exist in both datasets)
      --threshold=80 — minimum similarity score (0-100, default 80)
      --method=ratio — scoring: ratio, partial_ratio, token_sort_ratio

    Examples:
      fuzzyjoin companies.csv on(name) --threshold=85
      fuzzyjoin lookup.parquet on(city) --method=token_sort_ratio
    """
    try:
        from rapidfuzz import fuzz, process
    except ImportError:
        return (
            "rapidfuzz is required for fuzzyjoin.\n"
            "Install: pip install rapidfuzz"
        )

    import polars as pl
    from openstat.io.loader import load_file

    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: fuzzyjoin <other_file> on(<col>) [--threshold=80]"

    other_path = ca.positional[0]
    on_raw = ca.rest_after("on")
    if not on_raw:
        return "Specify join column: on(<col>)"
    on_col = on_raw.strip().strip("()")

    # Parse options up front and report bad values as messages; the float()
    # conversion previously sat outside any try block and could raise.
    try:
        threshold = float(ca.options.get("threshold", 80))
    except (TypeError, ValueError):
        return "Invalid --threshold value: expected a number between 0 and 100."

    scorers = {
        "ratio": fuzz.ratio,
        "partial_ratio": fuzz.partial_ratio,
        "token_sort_ratio": fuzz.token_sort_ratio,
    }
    method_name = ca.options.get("method", "ratio")
    scorer = scorers.get(method_name)
    if scorer is None:
        # Do not silently fall back to `ratio`: a mistyped method would
        # otherwise produce results from a different scorer unnoticed.
        return (
            f"Unknown method: {method_name}. "
            f"Use one of: {', '.join(scorers)}"
        )

    try:
        df_left = session.require_data()
        df_right = load_file(other_path)

        if on_col not in df_left.columns:
            return f"Column '{on_col}' not in current dataset."
        if on_col not in df_right.columns:
            return f"Column '{on_col}' not in {other_path}."

        left_vals = df_left[on_col].cast(pl.Utf8).to_list()
        right_vals = df_right[on_col].cast(pl.Utf8).to_list()
        # Candidate strings: nulls removed and duplicates collapsed while
        # keeping first-occurrence order, so the best match per value is
        # unchanged but fewer comparisons are made.
        choices = list(dict.fromkeys(v for v in right_vals if v is not None))

        # For each left value find the best-scoring right value.
        best_match = []
        best_score = []
        for lv in left_vals:
            if lv is None:
                best_match.append(None)
                best_score.append(0.0)
                continue
            hit = process.extractOne(lv, choices, scorer=scorer)
            if hit and hit[1] >= threshold:
                best_match.append(hit[0])
                best_score.append(float(hit[1]))
            else:
                # Below threshold: no match, but keep the score for reference.
                best_match.append(None)
                best_score.append(float(hit[1]) if hit else 0.0)

        df_left = df_left.with_columns([
            pl.Series("_fuzzy_match", best_match),
            pl.Series("_fuzzy_score", best_score),
        ])

        # Prefix the right-hand columns with "_r_" and expose its key column
        # as "_fuzzy_match" so it lines up with the matched value.
        df_right_renamed = df_right.rename(
            {c: f"_r_{c}" if c != on_col else c for c in df_right.columns}
        ).rename({on_col: "_fuzzy_match"})

        joined = df_left.join(df_right_renamed, on="_fuzzy_match", how="left")
        session.snapshot()
        session.df = joined

        # Count rows that actually received a match; counting by score would
        # include null rows (score 0.0) whenever threshold <= 0.
        n_matched = sum(1 for m in best_match if m is not None)
        return (
            f"Fuzzy join complete. {n_matched}/{df_left.height} rows matched "
            f"(threshold={threshold:.0f}). New shape: {session.shape_str}"
        )

    except Exception as e:
        # Command boundary: turn any unexpected error into a user message.
        return friendly_error(e, "fuzzyjoin")