visidata 2.11.dev0__py3-none-any.whl → 3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- visidata/__init__.py +72 -91
- visidata/_input.py +263 -44
- visidata/_open.py +84 -29
- visidata/_types.py +22 -4
- visidata/_urlcache.py +17 -4
- visidata/aggregators.py +65 -25
- visidata/apps/__init__.py +0 -0
- visidata/apps/vdsql/__about__.py +8 -0
- visidata/apps/vdsql/__init__.py +5 -0
- visidata/apps/vdsql/__main__.py +27 -0
- visidata/apps/vdsql/_ibis.py +748 -0
- visidata/apps/vdsql/bigquery.py +61 -0
- visidata/apps/vdsql/clickhouse.py +53 -0
- visidata/apps/vdsql/setup.py +40 -0
- visidata/apps/vdsql/snowflake.py +67 -0
- visidata/apps/vgit/__init__.py +13 -0
- visidata/apps/vgit/__main__.py +3 -0
- visidata/apps/vgit/abort.py +23 -0
- visidata/apps/vgit/blame.py +76 -0
- visidata/apps/vgit/branch.py +153 -0
- visidata/apps/vgit/config.py +95 -0
- visidata/apps/vgit/diff.py +169 -0
- visidata/apps/vgit/gitsheet.py +161 -0
- visidata/apps/vgit/grep.py +37 -0
- visidata/apps/vgit/log.py +81 -0
- visidata/apps/vgit/main.py +55 -0
- visidata/apps/vgit/remote.py +57 -0
- visidata/apps/vgit/repos.py +71 -0
- visidata/apps/vgit/setup.py +37 -0
- visidata/apps/vgit/stash.py +69 -0
- visidata/apps/vgit/status.py +204 -0
- visidata/apps/vgit/statusbar.py +34 -0
- visidata/basesheet.py +59 -50
- visidata/canvas.py +251 -99
- visidata/choose.py +15 -11
- visidata/clean_names.py +29 -0
- visidata/clipboard.py +84 -18
- visidata/cliptext.py +220 -46
- visidata/cmdlog.py +89 -114
- visidata/color.py +142 -56
- visidata/column.py +134 -131
- visidata/ddw/input.ddw +74 -79
- visidata/ddw/regex.ddw +57 -0
- visidata/ddwplay.py +33 -14
- visidata/deprecated.py +77 -3
- visidata/desktop/visidata.desktop +7 -0
- visidata/editor.py +12 -6
- visidata/errors.py +5 -1
- visidata/experimental/__init__.py +0 -0
- visidata/experimental/diff_sheet.py +29 -0
- visidata/experimental/digit_autoedit.py +6 -0
- visidata/experimental/gdrive.py +89 -0
- visidata/experimental/google.py +37 -0
- visidata/experimental/gsheets.py +79 -0
- visidata/experimental/live_search.py +37 -0
- visidata/experimental/liveupdate.py +45 -0
- visidata/experimental/mark.py +133 -0
- visidata/experimental/noahs_tapestry/__init__.py +1 -0
- visidata/experimental/noahs_tapestry/tapestry.py +147 -0
- visidata/experimental/rownum.py +73 -0
- visidata/experimental/slide_cells.py +26 -0
- visidata/expr.py +8 -4
- visidata/extensible.py +32 -6
- visidata/features/__init__.py +0 -0
- visidata/features/addcol_audiometadata.py +42 -0
- visidata/features/addcol_histogram.py +34 -0
- visidata/features/canvas_save_svg.py +69 -0
- visidata/features/change_precision.py +46 -0
- visidata/features/cmdpalette.py +163 -0
- visidata/features/colorbrewer.py +363 -0
- visidata/{colorsheet.py → features/colorsheet.py} +17 -16
- visidata/features/command_server.py +105 -0
- visidata/features/currency_to_usd.py +70 -0
- visidata/{customdate.py → features/customdate.py} +2 -0
- visidata/features/dedupe.py +132 -0
- visidata/{describe.py → features/describe.py} +17 -15
- visidata/features/errors_guide.py +26 -0
- visidata/features/expand_cols.py +202 -0
- visidata/{fill.py → features/fill.py} +4 -2
- visidata/{freeze.py → features/freeze.py} +11 -6
- visidata/features/graph_seaborn.py +79 -0
- visidata/features/helloworld.py +10 -0
- visidata/features/hint_types.py +17 -0
- visidata/{incr.py → features/incr.py} +5 -0
- visidata/{join.py → features/join.py} +107 -53
- visidata/features/known_cols.py +21 -0
- visidata/features/layout.py +62 -0
- visidata/{melt.py → features/melt.py} +33 -21
- visidata/features/normcol.py +118 -0
- visidata/features/open_config.py +7 -0
- visidata/features/open_syspaste.py +18 -0
- visidata/features/ping.py +157 -0
- visidata/features/procmgr.py +208 -0
- visidata/features/random_sample.py +6 -0
- visidata/{regex.py → features/regex.py} +47 -31
- visidata/features/reload_every.py +55 -0
- visidata/features/rename_col_cascade.py +30 -0
- visidata/features/scroll_context.py +60 -0
- visidata/features/select_equal_selected.py +11 -0
- visidata/features/setcol_fake.py +65 -0
- visidata/{slide.py → features/slide.py} +75 -21
- visidata/features/sparkline.py +48 -0
- visidata/features/status_source.py +20 -0
- visidata/{sysedit.py → features/sysedit.py} +2 -1
- visidata/features/sysopen_mailcap.py +46 -0
- visidata/features/term_extras.py +13 -0
- visidata/{transpose.py → features/transpose.py} +5 -4
- visidata/features/type_ipaddr.py +73 -0
- visidata/features/type_url.py +11 -0
- visidata/{unfurl.py → features/unfurl.py} +9 -9
- visidata/{window.py → features/window.py} +2 -2
- visidata/form.py +50 -21
- visidata/freqtbl.py +81 -33
- visidata/fuzzymatch.py +414 -0
- visidata/graph.py +105 -33
- visidata/guide.py +180 -0
- visidata/help.py +75 -44
- visidata/hint.py +39 -0
- visidata/indexsheet.py +109 -0
- visidata/input_history.py +55 -0
- visidata/interface.py +58 -0
- visidata/keys.py +17 -16
- visidata/loaders/__init__.py +9 -0
- visidata/loaders/_pandas.py +61 -21
- visidata/loaders/api_airtable.py +70 -0
- visidata/loaders/api_bitio.py +102 -0
- visidata/loaders/api_matrix.py +148 -0
- visidata/loaders/api_reddit.py +306 -0
- visidata/loaders/api_zulip.py +249 -0
- visidata/loaders/archive.py +41 -7
- visidata/loaders/arrow.py +7 -7
- visidata/loaders/conll.py +49 -0
- visidata/loaders/csv.py +25 -7
- visidata/loaders/eml.py +3 -4
- visidata/loaders/f5log.py +1204 -0
- visidata/loaders/fec.py +325 -0
- visidata/loaders/fixed_width.py +3 -5
- visidata/loaders/frictionless.py +3 -3
- visidata/loaders/geojson.py +8 -5
- visidata/loaders/google.py +48 -0
- visidata/loaders/graphviz.py +4 -4
- visidata/loaders/hdf5.py +4 -4
- visidata/loaders/html.py +48 -10
- visidata/loaders/http.py +84 -30
- visidata/loaders/imap.py +20 -10
- visidata/loaders/jrnl.py +52 -0
- visidata/loaders/json.py +83 -29
- visidata/loaders/jsonla.py +74 -0
- visidata/loaders/lsv.py +15 -11
- visidata/loaders/mailbox.py +40 -0
- visidata/loaders/markdown.py +1 -3
- visidata/loaders/mbtiles.py +4 -5
- visidata/loaders/mysql.py +11 -13
- visidata/loaders/npy.py +7 -7
- visidata/loaders/odf.py +4 -1
- visidata/loaders/orgmode.py +428 -0
- visidata/loaders/pandas_freqtbl.py +14 -20
- visidata/loaders/parquet.py +62 -6
- visidata/loaders/pcap.py +3 -3
- visidata/loaders/pdf.py +4 -3
- visidata/loaders/png.py +19 -13
- visidata/loaders/postgres.py +9 -8
- visidata/loaders/rec.py +7 -3
- visidata/loaders/s3.py +342 -0
- visidata/loaders/sas.py +5 -5
- visidata/loaders/scrape.py +186 -0
- visidata/loaders/shp.py +6 -5
- visidata/loaders/spss.py +5 -6
- visidata/loaders/sqlite.py +68 -28
- visidata/loaders/texttables.py +1 -1
- visidata/loaders/toml.py +60 -0
- visidata/loaders/tsv.py +61 -19
- visidata/loaders/ttf.py +19 -7
- visidata/loaders/unzip_http.py +6 -5
- visidata/loaders/usv.py +1 -1
- visidata/loaders/vcf.py +16 -16
- visidata/loaders/vds.py +10 -7
- visidata/loaders/vdx.py +30 -5
- visidata/loaders/xlsb.py +8 -1
- visidata/loaders/xlsx.py +145 -25
- visidata/loaders/xml.py +6 -3
- visidata/loaders/xword.py +4 -4
- visidata/loaders/yaml.py +15 -5
- visidata/macos.py +1 -1
- visidata/macros.py +130 -41
- visidata/main.py +119 -94
- visidata/mainloop.py +101 -154
- visidata/man/parse_options.py +2 -2
- visidata/man/vd.1 +302 -147
- visidata/man/vd.txt +291 -151
- visidata/memory.py +3 -3
- visidata/menu.py +104 -423
- visidata/metasheets.py +59 -141
- visidata/modify.py +79 -23
- visidata/motd.py +3 -3
- visidata/mouse.py +137 -0
- visidata/movement.py +43 -35
- visidata/optionssheet.py +99 -0
- visidata/path.py +131 -43
- visidata/pivot.py +74 -47
- visidata/plugins.py +65 -192
- visidata/pyobj.py +50 -201
- visidata/rename_col.py +20 -0
- visidata/save.py +42 -20
- visidata/search.py +54 -10
- visidata/selection.py +84 -5
- visidata/settings.py +162 -24
- visidata/sheets.py +229 -257
- visidata/shell.py +51 -21
- visidata/sidebar.py +162 -0
- visidata/sort.py +11 -4
- visidata/statusbar.py +113 -104
- visidata/stored_list.py +43 -0
- visidata/stored_prop.py +38 -0
- visidata/tests/conftest.py +3 -3
- visidata/tests/test_cliptext.py +39 -0
- visidata/tests/test_commands.py +62 -7
- visidata/tests/test_edittext.py +2 -2
- visidata/tests/test_features.py +17 -0
- visidata/tests/test_menu.py +14 -0
- visidata/tests/test_path.py +13 -4
- visidata/text_source.py +53 -0
- visidata/textsheet.py +10 -3
- visidata/theme.py +44 -0
- visidata/themes/__init__.py +0 -0
- visidata/themes/ascii8.py +84 -0
- visidata/themes/asciimono.py +84 -0
- visidata/themes/light.py +17 -0
- visidata/threads.py +87 -39
- visidata/tuiwin.py +22 -0
- visidata/type_currency.py +22 -3
- visidata/type_date.py +31 -9
- visidata/type_floatsi.py +5 -1
- visidata/undo.py +18 -6
- visidata/utils.py +106 -23
- visidata/vdobj.py +28 -17
- visidata/windows.py +10 -0
- visidata/wrappers.py +9 -3
- visidata-3.0.data/data/share/applications/visidata.desktop +7 -0
- {visidata-2.11.dev0.data → visidata-3.0.data}/data/share/man/man1/vd.1 +302 -147
- {visidata-2.11.dev0.data → visidata-3.0.data}/data/share/man/man1/visidata.1 +302 -147
- visidata-3.0.data/scripts/vd2to3.vdx +9 -0
- {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/METADATA +13 -11
- visidata-3.0.dist-info/RECORD +257 -0
- {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/WHEEL +1 -1
- {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/entry_points.txt +0 -1
- visidata/layout.py +0 -44
- visidata/misc.py +0 -5
- visidata-2.11.dev0.dist-info/RECORD +0 -142
- /visidata/{repeat.py → features/repeat.py} +0 -0
- {visidata-2.11.dev0.data → visidata-3.0.data}/scripts/vd +0 -0
- {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/LICENSE.gpl3 +0 -0
- {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/top_level.txt +0 -0
visidata/freqtbl.py
CHANGED
@@ -5,8 +5,7 @@ from visidata import vd, asyncthread, vlen, VisiData, Column, AttrColumn, Sheet,
|
|
5
5
|
from visidata.pivot import PivotSheet, PivotGroupRow
|
6
6
|
|
7
7
|
|
8
|
-
vd.
|
9
|
-
vd.option('disp_histolen', 50, 'width of histogram column')
|
8
|
+
vd.theme_option('disp_histogram', '■', 'histogram element character')
|
10
9
|
vd.option('histogram_bins', 0, 'number of bins for histogram of numeric columns')
|
11
10
|
vd.option('numeric_binning', False, 'bin numeric columns into ranges', replay=True)
|
12
11
|
|
@@ -14,7 +13,7 @@ vd.option('numeric_binning', False, 'bin numeric columns into ranges', replay=Tr
|
|
14
13
|
@VisiData.api
|
15
14
|
def valueNames(vd, discrete_vals, numeric_vals):
|
16
15
|
ret = [ '+'.join(str(x) for x in discrete_vals) ]
|
17
|
-
if numeric_vals != (0, 0):
|
16
|
+
if isinstance(numeric_vals, tuple) and numeric_vals != (0, 0):
|
18
17
|
ret.append('%s-%s' % numeric_vals)
|
19
18
|
|
20
19
|
return '+'.join(ret)
|
@@ -22,18 +21,37 @@ def valueNames(vd, discrete_vals, numeric_vals):
|
|
22
21
|
class HistogramColumn(Column):
|
23
22
|
def calcValue(col, row):
|
24
23
|
histogram = col.sheet.options.disp_histogram
|
25
|
-
histolen = col.
|
24
|
+
histolen = col.width-2
|
26
25
|
return histogram*(histolen*len(row.sourcerows)//col.sheet.largest)
|
27
26
|
|
28
27
|
|
28
|
+
def makeFreqTable(sheet, *groupByCols):
    # Build a FreqTableSheet over `sheet`, grouped by the given columns.
    # The new sheet's name parts are the source sheet name followed by the
    # dash-joined grouping column names with a "_freq" suffix.
    joinedNames = '-'.join(col.name for col in groupByCols)
    return FreqTableSheet(
        sheet.name,
        joinedNames + '_freq',
        groupByCols=groupByCols,
        source=sheet,
    )
|
33
|
+
|
34
|
+
|
29
35
|
class FreqTableSheet(PivotSheet):
|
30
36
|
'Generate frequency-table sheet on currently selected column.'
|
37
|
+
guide = '''# Frequency Table
|
38
|
+
This is a *frequency analysis* of _{sheet.groupByColsName}_ from the *{sheet.groupByCols[0].sheet}* sheet.
|
39
|
+
|
40
|
+
Each row on this sheet corresponds to a *bin* of rows on the source sheet that have a distinct value. The _count_ and _percent_ columns show how many rows on the source sheet are in this bin.
|
41
|
+
|
42
|
+
- `Enter` to open a copy of the source sheet, with only the rows in the current bin.
|
43
|
+
- `g Enter` to open a copy of the source sheet, with a combination of the rows from all selected bins.
|
44
|
+
|
45
|
+
## Tips
|
46
|
+
|
47
|
+
- Use `+` on the source sheet, to add aggregators on other columns, and those metrics will appear as separate columns here.
|
48
|
+
- Selecting bins on this sheet will select those rows on the source sheet.
|
49
|
+
'''
|
31
50
|
rowtype = 'bins' # rowdef FreqRow(keys, sourcerows)
|
32
51
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
self.largest = 1
|
52
|
+
@property
|
53
|
+
def groupByColsName(self):
|
54
|
+
return '+'.join(c.name for c in self.groupByCols)
|
37
55
|
|
38
56
|
def selectRow(self, row):
|
39
57
|
self.source.select(row.sourcerows) # select all entries in the bin on the source sheet
|
@@ -46,10 +64,8 @@ class FreqTableSheet(PivotSheet):
|
|
46
64
|
def updateLargest(self, grouprow):
|
47
65
|
self.largest = max(self.largest, len(grouprow.sourcerows))
|
48
66
|
|
49
|
-
|
50
|
-
|
51
|
-
'Generate frequency table then reverse-sort by length.'
|
52
|
-
super().initCols()
|
67
|
+
def resetCols(self):
|
68
|
+
super().resetCols()
|
53
69
|
|
54
70
|
# add default bonus columns
|
55
71
|
for c in [
|
@@ -58,34 +74,39 @@ class FreqTableSheet(PivotSheet):
|
|
58
74
|
]:
|
59
75
|
self.addColumn(c)
|
60
76
|
|
61
|
-
if self.options.
|
62
|
-
c = HistogramColumn('histogram', type=str, width=self.options.
|
77
|
+
if self.options.disp_histogram:
|
78
|
+
c = HistogramColumn('histogram', type=str, width=self.options.default_width*2)
|
63
79
|
self.addColumn(c)
|
64
80
|
|
81
|
+
# if non-numeric grouping, reverse sort by count at end of load
|
82
|
+
if not any(vd.isNumeric(c) for c in self.groupByCols):
|
83
|
+
self._ordering = [('count', True)]
|
84
|
+
|
85
|
+
def loader(self):
|
86
|
+
'Generate frequency table.'
|
65
87
|
# two more threads
|
66
88
|
vd.sync(self.addAggregateCols(),
|
67
89
|
self.groupRows(self.updateLargest))
|
68
90
|
|
91
|
+
def afterLoad(self):
|
92
|
+
super().afterLoad()
|
69
93
|
if self.nCols > len(self.groupByCols)+3: # hide percent/histogram if aggregations added
|
70
94
|
self.column('percent').hide()
|
71
95
|
self.column('histogram').hide()
|
72
96
|
|
73
|
-
if not [c for c in self.groupByCols if vd.isNumeric(c)]:
|
74
|
-
self.orderBy(self.column('count'), reverse=True)
|
75
|
-
|
76
97
|
def openRow(self, row):
|
77
98
|
'open copy of source sheet with rows that are grouped in current row'
|
78
99
|
if row.sourcerows:
|
79
100
|
vs = copy(self.source)
|
80
|
-
vs.
|
101
|
+
vs.names = vs.names + [vd.valueNames(row.discrete_keys, row.numeric_key)]
|
81
102
|
vs.rows=copy(row.sourcerows)
|
82
|
-
vs.source=self.source
|
83
103
|
return vs
|
84
104
|
vd.warning("no source rows")
|
85
105
|
|
86
106
|
def openRows(self, rows):
|
87
107
|
vs = copy(self.source)
|
88
|
-
vs.
|
108
|
+
vs.names = vs.names + ["several"]
|
109
|
+
vs.source = self
|
89
110
|
vs.rows = list(itertools.chain.from_iterable(row.sourcerows for row in rows))
|
90
111
|
return vs
|
91
112
|
|
@@ -94,23 +115,50 @@ class FreqTableSheet(PivotSheet):
|
|
94
115
|
|
95
116
|
|
96
117
|
class FreqTableSheetSummary(FreqTableSheet):
|
97
|
-
'Append a PivotGroupRow to
|
98
|
-
|
99
|
-
def reload(self):
|
100
|
-
FreqTableSheet.reload.__wrapped__(self)
|
118
|
+
'Append a PivotGroupRow to FreqTableSheet with only selectedRows.'
|
119
|
+
def afterLoad(self):
|
101
120
|
self.addRow(PivotGroupRow(['Selected'], (0,0), self.source.selectedRows, {}))
|
121
|
+
super().afterLoad()
|
102
122
|
|
103
|
-
Sheet.addCommand('F', 'freq-col', 'vd.push(FreqTableSheet(sheet, cursorCol))', 'open Frequency Table grouped on current column, with aggregations of other columns')
|
104
|
-
Sheet.addCommand('gF', 'freq-keys', 'vd.push(FreqTableSheet(sheet, *keyCols))', 'open Frequency Table grouped by all key columns on source sheet, with aggregations of other columns')
|
105
|
-
Sheet.addCommand('zF', 'freq-summary', 'vd.push(FreqTableSheetSummary(sheet, Column("Total", sheet=sheet, getter=lambda col, row: "Total")))', 'open one-line summary for all rows and selected rows')
|
106
123
|
|
107
|
-
|
124
|
+
def makeFreqTableSheetSummary(sheet, *groupByCols):
    # Build a FreqTableSheetSummary over `sheet`, grouped by the given columns.
    # Naming matches makeFreqTable: source sheet name, then the dash-joined
    # grouping column names with a "_freq" suffix.
    joinedNames = '-'.join(col.name for col in groupByCols)
    return FreqTableSheetSummary(
        sheet.name,
        joinedNames + '_freq',
        groupByCols=groupByCols,
        source=sheet,
    )
|
129
|
+
|
130
|
+
|
131
|
+
@VisiData.api
class FreqTablePreviewSheet(Sheet):
    # Sheet whose rows are always the source rows of the bin currently under
    # the cursor on its source sheet (see the 'open-preview' command).
    @property
    def rows(self):
        # Recomputed on every access so the rows follow the source cursor.
        cursorBin = self.source.cursorRow
        return cursorBin.sourcerows
|
136
|
+
|
137
|
+
|
138
|
+
FreqTableSheet.addCommand('', 'open-preview', 'vd.push(FreqTablePreviewSheet(sheet.name, "preview", source=sheet, columns=source.columns), pane=2); vd.options.disp_splitwin_pct=50', 'open split preview of source rows at cursor')
|
139
|
+
|
140
|
+
Sheet.addCommand('F', 'freq-col', 'vd.push(makeFreqTable(sheet, cursorCol))', 'open Frequency Table grouped on current column, with aggregations of other columns')
|
141
|
+
Sheet.addCommand('gF', 'freq-keys', 'vd.push(makeFreqTable(sheet, *keyCols))', 'open Frequency Table grouped by all key columns on source sheet, with aggregations of other columns')
|
142
|
+
Sheet.addCommand('zF', 'freq-summary', 'vd.push(makeFreqTableSheetSummary(sheet, Column("Total", sheet=sheet, getter=lambda col, row: "Total")))', 'open one-line summary for all rows and selected rows')
|
143
|
+
|
144
|
+
ColumnsSheet.addCommand(ENTER, 'freq-row', 'vd.push(makeFreqTable(source[0], cursorRow))', 'open a Frequency Table sheet grouped on column referenced in current row')
|
108
145
|
vd.addMenuItem('Data', 'Frequency table', 'current row', 'freq-row')
|
109
146
|
|
110
147
|
FreqTableSheet.addCommand('gu', 'unselect-rows', 'unselect(selectedRows)', 'unselect all source rows grouped in current row')
|
111
148
|
FreqTableSheet.addCommand('g'+ENTER, 'dive-selected', 'vd.push(openRows(selectedRows))', 'open copy of source sheet with rows that are grouped in selected rows')
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
149
|
+
FreqTableSheet.addCommand('', 'select-first', 'for r in rows: source.select([r.sourcerows[0]])', 'select first source row in each bin')
|
150
|
+
|
151
|
+
FreqTableSheet.init('largest', lambda: 1)
|
152
|
+
|
153
|
+
vd.addGlobals(
|
154
|
+
makeFreqTable=makeFreqTable,
|
155
|
+
makeFreqTableSheetSummary=makeFreqTableSheetSummary,
|
156
|
+
FreqTableSheet=FreqTableSheet,
|
157
|
+
FreqTableSheetSummary=FreqTableSheetSummary,
|
158
|
+
HistogramColumn=HistogramColumn,
|
159
|
+
)
|
160
|
+
|
161
|
+
vd.addMenuItems('''
|
162
|
+
Data > Frequency table > current column > freq-col
|
163
|
+
Data > Frequency table > key columns > freq-keys
|
164
|
+
''')
|
visidata/fuzzymatch.py
ADDED
@@ -0,0 +1,414 @@
|
|
1
|
+
''' Fuzzy String Matching.
|
2
|
+
|
3
|
+
This module is a pretty verbatim Python port of fzf's FuzzyMatchV2
|
4
|
+
trimmed down to a basic usecase of matching ASCII strings case sensitively.
|
5
|
+
|
6
|
+
For more information check out the source, I have not bothered to copy
|
7
|
+
the introductory comment/documentation:
|
8
|
+
|
9
|
+
https://github.com/junegunn/fzf/blob/b1a0ab8086/src/algo/algo.go
|
10
|
+
|
11
|
+
'''
|
12
|
+
import collections
|
13
|
+
from dataclasses import dataclass
|
14
|
+
from enum import Enum
|
15
|
+
from visidata import VisiData, vd
|
16
|
+
|
17
|
+
# Overwrite to true to get some diagnostic visualization
DEBUG = False

scoreMatch = 16
scoreGapStart = -3
scoreGapExtension = -1

# We prefer matches at the beginning of a word, but the bonus should not be
# too great to prevent the longer acronym matches from always winning over
# shorter fuzzy matches. The bonus point here was specifically chosen that
# the bonus is cancelled when the gap between the acronyms grows over
# 8 characters, which is approximately the average length of the words found
# in web2 dictionary and my file system.
#
# Floor division keeps this bonus -- and every bonus and score derived from
# it -- an int, matching the `score: int` annotation on MatchResult.  True
# division would silently turn all scores into floats.
bonusBoundary = scoreMatch // 2

# Although bonus point for non-word characters is non-contextual, we need it
# for computing bonus points for consecutive chunks starting with a non-word
# character.
bonusNonWord = scoreMatch // 2

# Edge-triggered bonus for matches in camelCase words.
# Compared to word-boundary case, they don't accompany single-character gaps
# (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
bonusCamel123 = bonusBoundary + scoreGapExtension

# Minimum bonus point given to characters in consecutive chunks.
# Note that bonus points for consecutive matches wouldn't have been needed if
# we used a fixed match score as in the original algorithm.
bonusConsecutive = -(scoreGapStart + scoreGapExtension)

# The first character in the typed pattern usually has more significance
# than the rest so it's important that it appears at special positions where
# bonus points are given, e.g. 'to-go' vs. 'ongoing' on 'og' or on 'ogo'.
# The amount of the extra bonus should be limited so that the gap penalty is
# still respected.
bonusFirstCharMultiplier = 2

# Extra bonus for word boundary after whitespace character or beginning of the string
bonusBoundaryWhite = bonusBoundary + 2

# Extra bonus for word boundary after a delimiter character (see delimiterChars)
bonusBoundaryDelimiter = bonusBoundary + 1

# Characters classified as delimiters: slash, comma, colon, semi-colon, pipe
delimiterChars = '/,:;|'
|
61
|
+
|
62
|
+
vd.theme_option('color_match', 'red', 'color for matching chars in palette chooser')

# Characters that charClassOfAscii classifies as whitespace.
whiteChars = ' \t\n\v\f\r\x85\xA0'

# Character classes, numbered 0..6 in this order.  The numeric order matters:
# bonusFor() tests `class_ > charNonWord` to mean "delimiter or word character".
charWhite, charNonWord, charDelimiter, charLower, charUpper, charLetter, charNumber = range(7)

# Class assumed for the (virtual) character preceding the scanned text.
initialCharClass = charWhite
|
76
|
+
|
77
|
+
|
78
|
+
def asciiFuzzyIndex(target, pattern):
    '''Return a fuzzy* starting position of the pattern,
    or -1, if pattern isn't a fuzzy match.

    The pattern is a fuzzy match when its characters occur in target in the
    same order, not necessarily adjacent.  An empty pattern returns 0.

    *the position is adapted one back, if possible,
    for bonus determination reasons.
    '''
    first_idx = 0
    search_from = 0
    for pidx, pchar in enumerate(pattern):
        found = target.find(pchar, search_from)
        if found < 0:
            return -1
        if pidx == 0 and found > 0:
            # Step back to find the right bonus point
            first_idx = found - 1
        # continue searching strictly after the character just matched
        search_from = found + 1
    return first_idx
|
95
|
+
|
96
|
+
|
97
|
+
def charClassOfAscii(char):
    '''Return the character class constant for a single character.

    ASCII letters and digits are checked first, then membership in
    whiteChars and delimiterChars; anything else is charNonWord.
    '''
    if 'a' <= char <= 'z':
        return charLower
    if 'A' <= char <= 'Z':
        return charUpper
    if '0' <= char <= '9':
        return charNumber
    if char in whiteChars:
        return charWhite
    if char in delimiterChars:
        return charDelimiter
    return charNonWord
|
109
|
+
|
110
|
+
|
111
|
+
def bonusFor(prevClass, class_):
    '''Return the bonus for a character of class_ that follows a character of prevClass.'''
    if class_ > charNonWord:
        # A delimiter or word character directly after whitespace, a
        # delimiter, or a non-word character starts at a boundary.
        boundaryBonus = {
            charWhite: bonusBoundaryWhite,          # word boundary after whitespace
            charDelimiter: bonusBoundaryDelimiter,  # word boundary after a delimiter character
            charNonWord: bonusBoundary,             # word boundary
        }
        if prevClass in boundaryBonus:
            return boundaryBonus[prevClass]

    # camelCase letter123
    lowerToUpper = prevClass == charLower and class_ == charUpper
    startsNumber = class_ == charNumber and prevClass != charNumber
    if lowerToUpper or startsNumber:
        return bonusCamel123

    if class_ == charNonWord:
        return bonusNonWord
    if class_ == charWhite:
        return bonusBoundaryWhite
    return 0
|
135
|
+
|
136
|
+
|
137
|
+
def debugV2(T, pattern, F, lastIdx, H, C):
    '''Print the score matrix (H) and consecutive-chunk matrix (C) for diagnostics.

    T is the target as a list of characters, F the first occurrence of each
    pattern character, and lastIdx the last target index covered by the matrices.
    '''
    # NOTE(review): the padding literals below are reproduced as they appear in
    # this view; confirm their widths against upstream if column alignment is off.
    width = lastIdx - F[0] + 1

    for i, f in enumerate(F):
        rowStart = i * width
        if i == 0:
            # Header line: the target characters spanned by the matrices.
            header = ''.join(f' {T[j]} ' for j in range(f, lastIdx + 1))
            print(' ' + header)

        # Score line: columns before this row's first occurrence are shown as 0.
        leading = ' 0 ' * len(range(F[0], f))
        scores = ''.join(
            f'{int(H[rowStart + idx - int(F[0])]):2d} ' for idx in range(f, lastIdx + 1)
        )
        print(pattern[i] + ' ' + leading + scores)

        # Consecutive-chunk line: only positive counts at or after this row's
        # first occurrence are shown.
        cells = []
        for idx, p in enumerate(C[rowStart : rowStart + width]):
            if idx + int(F[0]) < int(F[i]) or p <= 0:
                cells.append(' ')
            else:
                cells.append(f'{p:2d} ')
        print(' ' + ''.join(cells))
|
164
|
+
|
165
|
+
|
166
|
+
@dataclass
class MatchResult:
    '''Represents a scored match of a fuzzymatching search.

    start: starting index of where the pattern is in the target sequence
    end: Similarly, the end index (exclusive)
    score: A value of how good the match is.
    positions: A list of indices, indexing into the target sequence.
        Corresponds to every position a letter of the pattern was found
        for this particular alignment.

    _fuzzymatch returns MatchResult(0, 0, 0, []) for an empty pattern and
    MatchResult(-1, -1, 0, None) when the pattern does not match, so
    positions may be None.
    '''

    start: int
    end: int
    score: int
    positions: 'list[int] | None'
|
182
|
+
|
183
|
+
|
184
|
+
def _fuzzymatch(target: str, pattern: str) -> MatchResult:
    '''Fuzzy string matching algorithm.

    For a target sequence, check whether (and how well) a pattern matches.

    Returns a MatchResult, which contains start and end index of the match,
    a score, and the positions where the pattern occurred.  Positions are
    collected by a backtrace from the end of the match, so they are listed
    from last to first.  An empty pattern returns MatchResult(0, 0, 0, []);
    a pattern that does not match returns MatchResult(-1, -1, 0, None).

    The matching is case sensitive, so it's necessary to lower input and pattern
    in the caller, if preferred otherwise.

    The functionality is based on fzf's FuzzyMatchV2, minus some advanced features.
    '''
    patternLength = len(pattern)
    if patternLength == 0:
        return MatchResult(0, 0, 0, [])
    targetLength = len(target)

    # Phase 1: Optimized search for ASCII string
    idx = asciiFuzzyIndex(target, pattern)
    if idx < 0:
        return MatchResult(-1, -1, 0, None)

    H0 = [0] * targetLength
    C0 = [0] * targetLength
    # Bonus point for each position
    B = [0] * targetLength
    # The first occurrence of each character in the pattern
    F = [0] * patternLength
    T = list(target)

    # Phase 2: Calculate bonus for each point
    maxScore, maxScorePos = 0, 0
    pidx, lastIdx = 0, 0
    pchar0, pchar, prevH0, prevClass, inGap = (
        pattern[0],
        pattern[0],
        0,
        initialCharClass,
        False,
    )
    Tsub = T[idx:]
    H0sub, C0sub, Bsub = H0[idx:], C0[idx:], B[idx:]

    for off, char in enumerate(Tsub):
        class_ = charClassOfAscii(char)
        bonus = bonusFor(prevClass, class_)
        Bsub[off] = bonus
        prevClass = class_

        if char == pchar:
            if pidx < patternLength:
                F[pidx] = idx + off
                pidx += 1
                pchar = pattern[min(pidx, patternLength - 1)]
            lastIdx = idx + off

        if char == pchar0:
            score = scoreMatch + bonus * bonusFirstCharMultiplier
            H0sub[off] = score
            C0sub[off] = 1
            if patternLength == 1 and (score > maxScore):
                maxScore, maxScorePos = score, idx + off
                if bonus >= bonusBoundary:
                    # a single-character pattern cannot do better than a boundary match
                    break
            inGap = False
        else:
            if inGap:
                H0sub[off] = max(prevH0 + scoreGapExtension, 0)
            else:
                H0sub[off] = max(prevH0 + scoreGapStart, 0)
            C0sub[off] = 0
            inGap = True
        prevH0 = H0sub[off]

    # write back, because slices in python are a full copy (as opposed to go)
    H0[idx:], C0[idx:], B[idx:] = H0sub, C0sub, Bsub

    if pidx != patternLength:
        return MatchResult(-1, -1, 0, None)
    if patternLength == 1:
        # int() keeps the score type consistent with the multi-character return below
        return MatchResult(maxScorePos, maxScorePos + 1, int(maxScore), [maxScorePos])

    # Phase 3: Fill in score matrix (H)
    # do not allow omission.
    f0 = F[0]
    width = lastIdx - f0 + 1
    H = [0] * width * patternLength
    H[:width] = list(H0[f0 : lastIdx + 1])

    # Possible length of consecutive chunk at each position.
    C = [0] * width * patternLength
    C[:width] = C0[f0 : lastIdx + 1]

    # H and C are flat row-major matrices: one row per pattern character, one
    # column per target index in f0..lastIdx.  Cells are addressed directly by
    # index rather than through per-character slices, which would copy part of
    # the matrices on every inner iteration.
    for pidx, (f, pchar) in enumerate(zip(F[1:], pattern[1:]), start=1):
        row = pidx * width
        inGap = False
        H[row + f - f0 - 1] = 0
        for col in range(f, lastIdx + 1):
            char = T[col]
            cell = row + col - f0       # this row, this column
            left = cell - 1             # this row, previous column
            diag = cell - 1 - width     # previous row, previous column
            s1, s2, consecutive = 0, 0, 0

            if inGap:
                s2 = H[left] + scoreGapExtension
            else:
                s2 = H[left] + scoreGapStart

            if pchar == char:
                s1 = H[diag] + scoreMatch
                b = B[col]
                consecutive = C[diag] + 1
                if consecutive > 1:
                    fb = B[col - consecutive + 1]
                    # Break consecutive chunk
                    if b >= bonusBoundary and b > fb:
                        consecutive = 1
                    else:
                        b = max(b, max(bonusConsecutive, fb))
                if s1 + b < s2:
                    s1 += B[col]
                    consecutive = 0
                else:
                    s1 += b
            C[cell] = consecutive

            inGap = s1 < s2
            score = max(max(s1, s2), 0)
            if pidx == patternLength - 1 and score > maxScore:
                maxScore, maxScorePos = score, col
            H[cell] = score

    if DEBUG:
        debugV2(T, pattern, F, lastIdx, H, C)

    # Phase 4. (Optional) Backtrace to find character positions
    pos = []
    i = patternLength - 1
    j = maxScorePos
    preferMatch = True
    while True:
        I = i * width
        j0 = j - f0
        s = H[I + j0]

        s1, s2 = 0, 0
        if i > 0 and j >= int(F[i]):
            s1 = H[I - width + j0 - 1]
        if j > int(F[i]):
            s2 = H[I + j0 - 1]

        if s > s1 and (s > s2 or s == s2 and preferMatch):
            pos.append(j)
            if i == 0:
                break
            i -= 1
        preferMatch = (
            C[I + j0] > 1 or I + width + j0 + 1 < len(C) and C[I + width + j0 + 1] > 0
        )
        j -= 1

    # Start offset we return here is only relevant when begin tiebreak is used.
    # However finding the accurate offset requires backtracking, and we don't
    # want to pay extra cost for the option that has lost its importance.
    return MatchResult(j, maxScorePos + 1, int(maxScore), pos)
|
357
|
+
|
358
|
+
|
359
|
+
def _format_match(s, positions):
|
360
|
+
out = list(s)
|
361
|
+
for p in positions:
|
362
|
+
out[p] = f'[:match]{out[p]}[/]'
|
363
|
+
return "".join(out)
|
364
|
+
|
365
|
+
CombinedMatch = collections.namedtuple('CombinedMatch', 'score formatted match')


@VisiData.api
def fuzzymatch(vd, haystack: "list[dict[str, str]]", needles: "list[str]") -> "list[CombinedMatch]":
    '''Return a list of CombinedMatch for the dicts in haystack that match, best score first.

    Every value of every haystack dict is matched against every needle.  A
    dict is included when at least one of its values matches a needle with a
    positive score.  For each CombinedMatch:

    - score: the sum of the squared per-key match scores, as an int
    - formatted: dict of key -> value with matched characters marked up as [:match]c[/]
    - match: the original haystack dict

    When several needles match the same key, the result of the last matching
    needle is the one kept for that key.
    '''
    matches = []
    for h in haystack:
        match = {}
        formatted_hay = {}
        for k, v in h.items():
            for p in needles:
                mr = _fuzzymatch(v, p)
                if mr.score > 0:
                    match[k] = mr
                    formatted_hay[k] = _format_match(v, mr.positions)

        if match:
            # square to prefer larger scores in a single haystack
            score = int(sum(mr.score**2 for mr in match.values()))
            matches.append(CombinedMatch(score=score, formatted=formatted_hay, match=h))

    return sorted(matches, key=lambda m: -m.score)
|
389
|
+
|
390
|
+
|
391
|
+
@VisiData.api
def test_fuzzymatch(vd):
    # Self-checks for the fuzzy matching helpers, written as expectation tables.

    # (pattern, expected fuzzy start index) against the target 'helo'
    indexCases = [
        ('h', 0),
        ('hlo', 0),
        ('e', 0),
        ('el', 0),
        ('eo', 0),
        ('l', 1),
        ('lo', 1),
        ('o', 2),
        ('ooh', -1),
    ]
    for pat, expected in indexCases:
        assert asciiFuzzyIndex('helo', pat) == expected

    # (character, expected character class)
    classCases = [
        ('a', charLower),
        ('C', charUpper),
        ('2', charNumber),
        (' ', charWhite),
        (',', charDelimiter),
    ]
    for ch, expected in classCases:
        assert charClassOfAscii(ch) == expected

    # (target, pattern, expected result)
    matchCases = [
        ('hello', '', MatchResult(0, 0, 0, [])),
        ('hello', 'nono', MatchResult(-1, -1, 0, None)),
        ('hello', 'l', MatchResult(2, 3, 16, [2])),
        ('hello world', 'elo wo', MatchResult(1, 8, 127, [7, 6, 5, 4, 2, 1])),
    ]
    for tgt, pat, expected in matchCases:
        assert _fuzzymatch(tgt, pat) == expected
|