visidata 2.11.dev0__py3-none-any.whl → 3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253) hide show
  1. visidata/__init__.py +72 -91
  2. visidata/_input.py +263 -44
  3. visidata/_open.py +84 -29
  4. visidata/_types.py +22 -4
  5. visidata/_urlcache.py +17 -4
  6. visidata/aggregators.py +65 -25
  7. visidata/apps/__init__.py +0 -0
  8. visidata/apps/vdsql/__about__.py +8 -0
  9. visidata/apps/vdsql/__init__.py +5 -0
  10. visidata/apps/vdsql/__main__.py +27 -0
  11. visidata/apps/vdsql/_ibis.py +748 -0
  12. visidata/apps/vdsql/bigquery.py +61 -0
  13. visidata/apps/vdsql/clickhouse.py +53 -0
  14. visidata/apps/vdsql/setup.py +40 -0
  15. visidata/apps/vdsql/snowflake.py +67 -0
  16. visidata/apps/vgit/__init__.py +13 -0
  17. visidata/apps/vgit/__main__.py +3 -0
  18. visidata/apps/vgit/abort.py +23 -0
  19. visidata/apps/vgit/blame.py +76 -0
  20. visidata/apps/vgit/branch.py +153 -0
  21. visidata/apps/vgit/config.py +95 -0
  22. visidata/apps/vgit/diff.py +169 -0
  23. visidata/apps/vgit/gitsheet.py +161 -0
  24. visidata/apps/vgit/grep.py +37 -0
  25. visidata/apps/vgit/log.py +81 -0
  26. visidata/apps/vgit/main.py +55 -0
  27. visidata/apps/vgit/remote.py +57 -0
  28. visidata/apps/vgit/repos.py +71 -0
  29. visidata/apps/vgit/setup.py +37 -0
  30. visidata/apps/vgit/stash.py +69 -0
  31. visidata/apps/vgit/status.py +204 -0
  32. visidata/apps/vgit/statusbar.py +34 -0
  33. visidata/basesheet.py +59 -50
  34. visidata/canvas.py +251 -99
  35. visidata/choose.py +15 -11
  36. visidata/clean_names.py +29 -0
  37. visidata/clipboard.py +84 -18
  38. visidata/cliptext.py +220 -46
  39. visidata/cmdlog.py +89 -114
  40. visidata/color.py +142 -56
  41. visidata/column.py +134 -131
  42. visidata/ddw/input.ddw +74 -79
  43. visidata/ddw/regex.ddw +57 -0
  44. visidata/ddwplay.py +33 -14
  45. visidata/deprecated.py +77 -3
  46. visidata/desktop/visidata.desktop +7 -0
  47. visidata/editor.py +12 -6
  48. visidata/errors.py +5 -1
  49. visidata/experimental/__init__.py +0 -0
  50. visidata/experimental/diff_sheet.py +29 -0
  51. visidata/experimental/digit_autoedit.py +6 -0
  52. visidata/experimental/gdrive.py +89 -0
  53. visidata/experimental/google.py +37 -0
  54. visidata/experimental/gsheets.py +79 -0
  55. visidata/experimental/live_search.py +37 -0
  56. visidata/experimental/liveupdate.py +45 -0
  57. visidata/experimental/mark.py +133 -0
  58. visidata/experimental/noahs_tapestry/__init__.py +1 -0
  59. visidata/experimental/noahs_tapestry/tapestry.py +147 -0
  60. visidata/experimental/rownum.py +73 -0
  61. visidata/experimental/slide_cells.py +26 -0
  62. visidata/expr.py +8 -4
  63. visidata/extensible.py +32 -6
  64. visidata/features/__init__.py +0 -0
  65. visidata/features/addcol_audiometadata.py +42 -0
  66. visidata/features/addcol_histogram.py +34 -0
  67. visidata/features/canvas_save_svg.py +69 -0
  68. visidata/features/change_precision.py +46 -0
  69. visidata/features/cmdpalette.py +163 -0
  70. visidata/features/colorbrewer.py +363 -0
  71. visidata/{colorsheet.py → features/colorsheet.py} +17 -16
  72. visidata/features/command_server.py +105 -0
  73. visidata/features/currency_to_usd.py +70 -0
  74. visidata/{customdate.py → features/customdate.py} +2 -0
  75. visidata/features/dedupe.py +132 -0
  76. visidata/{describe.py → features/describe.py} +17 -15
  77. visidata/features/errors_guide.py +26 -0
  78. visidata/features/expand_cols.py +202 -0
  79. visidata/{fill.py → features/fill.py} +4 -2
  80. visidata/{freeze.py → features/freeze.py} +11 -6
  81. visidata/features/graph_seaborn.py +79 -0
  82. visidata/features/helloworld.py +10 -0
  83. visidata/features/hint_types.py +17 -0
  84. visidata/{incr.py → features/incr.py} +5 -0
  85. visidata/{join.py → features/join.py} +107 -53
  86. visidata/features/known_cols.py +21 -0
  87. visidata/features/layout.py +62 -0
  88. visidata/{melt.py → features/melt.py} +33 -21
  89. visidata/features/normcol.py +118 -0
  90. visidata/features/open_config.py +7 -0
  91. visidata/features/open_syspaste.py +18 -0
  92. visidata/features/ping.py +157 -0
  93. visidata/features/procmgr.py +208 -0
  94. visidata/features/random_sample.py +6 -0
  95. visidata/{regex.py → features/regex.py} +47 -31
  96. visidata/features/reload_every.py +55 -0
  97. visidata/features/rename_col_cascade.py +30 -0
  98. visidata/features/scroll_context.py +60 -0
  99. visidata/features/select_equal_selected.py +11 -0
  100. visidata/features/setcol_fake.py +65 -0
  101. visidata/{slide.py → features/slide.py} +75 -21
  102. visidata/features/sparkline.py +48 -0
  103. visidata/features/status_source.py +20 -0
  104. visidata/{sysedit.py → features/sysedit.py} +2 -1
  105. visidata/features/sysopen_mailcap.py +46 -0
  106. visidata/features/term_extras.py +13 -0
  107. visidata/{transpose.py → features/transpose.py} +5 -4
  108. visidata/features/type_ipaddr.py +73 -0
  109. visidata/features/type_url.py +11 -0
  110. visidata/{unfurl.py → features/unfurl.py} +9 -9
  111. visidata/{window.py → features/window.py} +2 -2
  112. visidata/form.py +50 -21
  113. visidata/freqtbl.py +81 -33
  114. visidata/fuzzymatch.py +414 -0
  115. visidata/graph.py +105 -33
  116. visidata/guide.py +180 -0
  117. visidata/help.py +75 -44
  118. visidata/hint.py +39 -0
  119. visidata/indexsheet.py +109 -0
  120. visidata/input_history.py +55 -0
  121. visidata/interface.py +58 -0
  122. visidata/keys.py +17 -16
  123. visidata/loaders/__init__.py +9 -0
  124. visidata/loaders/_pandas.py +61 -21
  125. visidata/loaders/api_airtable.py +70 -0
  126. visidata/loaders/api_bitio.py +102 -0
  127. visidata/loaders/api_matrix.py +148 -0
  128. visidata/loaders/api_reddit.py +306 -0
  129. visidata/loaders/api_zulip.py +249 -0
  130. visidata/loaders/archive.py +41 -7
  131. visidata/loaders/arrow.py +7 -7
  132. visidata/loaders/conll.py +49 -0
  133. visidata/loaders/csv.py +25 -7
  134. visidata/loaders/eml.py +3 -4
  135. visidata/loaders/f5log.py +1204 -0
  136. visidata/loaders/fec.py +325 -0
  137. visidata/loaders/fixed_width.py +3 -5
  138. visidata/loaders/frictionless.py +3 -3
  139. visidata/loaders/geojson.py +8 -5
  140. visidata/loaders/google.py +48 -0
  141. visidata/loaders/graphviz.py +4 -4
  142. visidata/loaders/hdf5.py +4 -4
  143. visidata/loaders/html.py +48 -10
  144. visidata/loaders/http.py +84 -30
  145. visidata/loaders/imap.py +20 -10
  146. visidata/loaders/jrnl.py +52 -0
  147. visidata/loaders/json.py +83 -29
  148. visidata/loaders/jsonla.py +74 -0
  149. visidata/loaders/lsv.py +15 -11
  150. visidata/loaders/mailbox.py +40 -0
  151. visidata/loaders/markdown.py +1 -3
  152. visidata/loaders/mbtiles.py +4 -5
  153. visidata/loaders/mysql.py +11 -13
  154. visidata/loaders/npy.py +7 -7
  155. visidata/loaders/odf.py +4 -1
  156. visidata/loaders/orgmode.py +428 -0
  157. visidata/loaders/pandas_freqtbl.py +14 -20
  158. visidata/loaders/parquet.py +62 -6
  159. visidata/loaders/pcap.py +3 -3
  160. visidata/loaders/pdf.py +4 -3
  161. visidata/loaders/png.py +19 -13
  162. visidata/loaders/postgres.py +9 -8
  163. visidata/loaders/rec.py +7 -3
  164. visidata/loaders/s3.py +342 -0
  165. visidata/loaders/sas.py +5 -5
  166. visidata/loaders/scrape.py +186 -0
  167. visidata/loaders/shp.py +6 -5
  168. visidata/loaders/spss.py +5 -6
  169. visidata/loaders/sqlite.py +68 -28
  170. visidata/loaders/texttables.py +1 -1
  171. visidata/loaders/toml.py +60 -0
  172. visidata/loaders/tsv.py +61 -19
  173. visidata/loaders/ttf.py +19 -7
  174. visidata/loaders/unzip_http.py +6 -5
  175. visidata/loaders/usv.py +1 -1
  176. visidata/loaders/vcf.py +16 -16
  177. visidata/loaders/vds.py +10 -7
  178. visidata/loaders/vdx.py +30 -5
  179. visidata/loaders/xlsb.py +8 -1
  180. visidata/loaders/xlsx.py +145 -25
  181. visidata/loaders/xml.py +6 -3
  182. visidata/loaders/xword.py +4 -4
  183. visidata/loaders/yaml.py +15 -5
  184. visidata/macos.py +1 -1
  185. visidata/macros.py +130 -41
  186. visidata/main.py +119 -94
  187. visidata/mainloop.py +101 -154
  188. visidata/man/parse_options.py +2 -2
  189. visidata/man/vd.1 +302 -147
  190. visidata/man/vd.txt +291 -151
  191. visidata/memory.py +3 -3
  192. visidata/menu.py +104 -423
  193. visidata/metasheets.py +59 -141
  194. visidata/modify.py +79 -23
  195. visidata/motd.py +3 -3
  196. visidata/mouse.py +137 -0
  197. visidata/movement.py +43 -35
  198. visidata/optionssheet.py +99 -0
  199. visidata/path.py +131 -43
  200. visidata/pivot.py +74 -47
  201. visidata/plugins.py +65 -192
  202. visidata/pyobj.py +50 -201
  203. visidata/rename_col.py +20 -0
  204. visidata/save.py +42 -20
  205. visidata/search.py +54 -10
  206. visidata/selection.py +84 -5
  207. visidata/settings.py +162 -24
  208. visidata/sheets.py +229 -257
  209. visidata/shell.py +51 -21
  210. visidata/sidebar.py +162 -0
  211. visidata/sort.py +11 -4
  212. visidata/statusbar.py +113 -104
  213. visidata/stored_list.py +43 -0
  214. visidata/stored_prop.py +38 -0
  215. visidata/tests/conftest.py +3 -3
  216. visidata/tests/test_cliptext.py +39 -0
  217. visidata/tests/test_commands.py +62 -7
  218. visidata/tests/test_edittext.py +2 -2
  219. visidata/tests/test_features.py +17 -0
  220. visidata/tests/test_menu.py +14 -0
  221. visidata/tests/test_path.py +13 -4
  222. visidata/text_source.py +53 -0
  223. visidata/textsheet.py +10 -3
  224. visidata/theme.py +44 -0
  225. visidata/themes/__init__.py +0 -0
  226. visidata/themes/ascii8.py +84 -0
  227. visidata/themes/asciimono.py +84 -0
  228. visidata/themes/light.py +17 -0
  229. visidata/threads.py +87 -39
  230. visidata/tuiwin.py +22 -0
  231. visidata/type_currency.py +22 -3
  232. visidata/type_date.py +31 -9
  233. visidata/type_floatsi.py +5 -1
  234. visidata/undo.py +18 -6
  235. visidata/utils.py +106 -23
  236. visidata/vdobj.py +28 -17
  237. visidata/windows.py +10 -0
  238. visidata/wrappers.py +9 -3
  239. visidata-3.0.data/data/share/applications/visidata.desktop +7 -0
  240. {visidata-2.11.dev0.data → visidata-3.0.data}/data/share/man/man1/vd.1 +302 -147
  241. {visidata-2.11.dev0.data → visidata-3.0.data}/data/share/man/man1/visidata.1 +302 -147
  242. visidata-3.0.data/scripts/vd2to3.vdx +9 -0
  243. {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/METADATA +13 -11
  244. visidata-3.0.dist-info/RECORD +257 -0
  245. {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/WHEEL +1 -1
  246. {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/entry_points.txt +0 -1
  247. visidata/layout.py +0 -44
  248. visidata/misc.py +0 -5
  249. visidata-2.11.dev0.dist-info/RECORD +0 -142
  250. /visidata/{repeat.py → features/repeat.py} +0 -0
  251. {visidata-2.11.dev0.data → visidata-3.0.data}/scripts/vd +0 -0
  252. {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/LICENSE.gpl3 +0 -0
  253. {visidata-2.11.dev0.dist-info → visidata-3.0.dist-info}/top_level.txt +0 -0
visidata/freqtbl.py CHANGED
@@ -5,8 +5,7 @@ from visidata import vd, asyncthread, vlen, VisiData, Column, AttrColumn, Sheet,
5
5
  from visidata.pivot import PivotSheet, PivotGroupRow
6
6
 
7
7
 
8
- vd.option('disp_histogram', '*', 'histogram element character')
9
- vd.option('disp_histolen', 50, 'width of histogram column')
8
+ vd.theme_option('disp_histogram', '', 'histogram element character')
10
9
  vd.option('histogram_bins', 0, 'number of bins for histogram of numeric columns')
11
10
  vd.option('numeric_binning', False, 'bin numeric columns into ranges', replay=True)
12
11
 
@@ -14,7 +13,7 @@ vd.option('numeric_binning', False, 'bin numeric columns into ranges', replay=Tr
14
13
  @VisiData.api
15
14
  def valueNames(vd, discrete_vals, numeric_vals):
16
15
  ret = [ '+'.join(str(x) for x in discrete_vals) ]
17
- if numeric_vals != (0, 0):
16
+ if isinstance(numeric_vals, tuple) and numeric_vals != (0, 0):
18
17
  ret.append('%s-%s' % numeric_vals)
19
18
 
20
19
  return '+'.join(ret)
@@ -22,18 +21,37 @@ def valueNames(vd, discrete_vals, numeric_vals):
22
21
  class HistogramColumn(Column):
23
22
  def calcValue(col, row):
24
23
  histogram = col.sheet.options.disp_histogram
25
- histolen = col.sheet.options.disp_histolen
24
+ histolen = col.width-2
26
25
  return histogram*(histolen*len(row.sourcerows)//col.sheet.largest)
27
26
 
28
27
 
28
+ def makeFreqTable(sheet, *groupByCols):
29
+ return FreqTableSheet(sheet.name,
30
+ '%s_freq' % '-'.join(col.name for col in groupByCols),
31
+ groupByCols=groupByCols,
32
+ source=sheet)
33
+
34
+
29
35
  class FreqTableSheet(PivotSheet):
30
36
  'Generate frequency-table sheet on currently selected column.'
37
+ guide = '''# Frequency Table
38
+ This is a *frequency analysis* of _{sheet.groupByColsName}_ from the *{sheet.groupByCols[0].sheet}* sheet.
39
+
40
+ Each row on this sheet corresponds to a *bin* of rows on the source sheet that have a distinct value. The _count_ and _percent_ columns show how many rows on the source sheet are in this bin.
41
+
42
+ - `Enter` to open a copy of the source sheet, with only the rows in the current bin.
43
+ - `g Enter` to open a copy of the source sheet, with a combination of the rows from all selected bins.
44
+
45
+ ## Tips
46
+
47
+ - Use `+` on the source sheet, to add aggregators on other columns, and those metrics will appear as separate columns here.
48
+ - Selecting bins on this sheet will select those rows on the source sheet.
49
+ '''
31
50
  rowtype = 'bins' # rowdef FreqRow(keys, sourcerows)
32
51
 
33
- def __init__(self, sheet, *groupByCols):
34
- fqcolname = '%s_%s_freq' % (sheet.name, '-'.join(col.name for col in groupByCols))
35
- super().__init__(fqcolname, groupByCols, [], source=sheet)
36
- self.largest = 1
52
+ @property
53
+ def groupByColsName(self):
54
+ return '+'.join(c.name for c in self.groupByCols)
37
55
 
38
56
  def selectRow(self, row):
39
57
  self.source.select(row.sourcerows) # select all entries in the bin on the source sheet
@@ -46,10 +64,8 @@ class FreqTableSheet(PivotSheet):
46
64
  def updateLargest(self, grouprow):
47
65
  self.largest = max(self.largest, len(grouprow.sourcerows))
48
66
 
49
- @asyncthread
50
- def reload(self):
51
- 'Generate frequency table then reverse-sort by length.'
52
- super().initCols()
67
+ def resetCols(self):
68
+ super().resetCols()
53
69
 
54
70
  # add default bonus columns
55
71
  for c in [
@@ -58,34 +74,39 @@ class FreqTableSheet(PivotSheet):
58
74
  ]:
59
75
  self.addColumn(c)
60
76
 
61
- if self.options.disp_histolen and self.options.disp_histogram:
62
- c = HistogramColumn('histogram', type=str, width=self.options.disp_histolen+2)
77
+ if self.options.disp_histogram:
78
+ c = HistogramColumn('histogram', type=str, width=self.options.default_width*2)
63
79
  self.addColumn(c)
64
80
 
81
+ # if non-numeric grouping, reverse sort by count at end of load
82
+ if not any(vd.isNumeric(c) for c in self.groupByCols):
83
+ self._ordering = [('count', True)]
84
+
85
+ def loader(self):
86
+ 'Generate frequency table.'
65
87
  # two more threads
66
88
  vd.sync(self.addAggregateCols(),
67
89
  self.groupRows(self.updateLargest))
68
90
 
91
+ def afterLoad(self):
92
+ super().afterLoad()
69
93
  if self.nCols > len(self.groupByCols)+3: # hide percent/histogram if aggregations added
70
94
  self.column('percent').hide()
71
95
  self.column('histogram').hide()
72
96
 
73
- if not [c for c in self.groupByCols if vd.isNumeric(c)]:
74
- self.orderBy(self.column('count'), reverse=True)
75
-
76
97
  def openRow(self, row):
77
98
  'open copy of source sheet with rows that are grouped in current row'
78
99
  if row.sourcerows:
79
100
  vs = copy(self.source)
80
- vs.name += "_"+vd.valueNames(row.discrete_keys, row.numeric_key)
101
+ vs.names = vs.names + [vd.valueNames(row.discrete_keys, row.numeric_key)]
81
102
  vs.rows=copy(row.sourcerows)
82
- vs.source=self.source
83
103
  return vs
84
104
  vd.warning("no source rows")
85
105
 
86
106
  def openRows(self, rows):
87
107
  vs = copy(self.source)
88
- vs.name += "_several"
108
+ vs.names = vs.names + ["several"]
109
+ vs.source = self
89
110
  vs.rows = list(itertools.chain.from_iterable(row.sourcerows for row in rows))
90
111
  return vs
91
112
 
@@ -94,23 +115,50 @@ class FreqTableSheet(PivotSheet):
94
115
 
95
116
 
96
117
  class FreqTableSheetSummary(FreqTableSheet):
97
- 'Append a PivotGroupRow to FreqTable with only selectedRows.'
98
- @asyncthread
99
- def reload(self):
100
- FreqTableSheet.reload.__wrapped__(self)
118
+ 'Append a PivotGroupRow to FreqTableSheet with only selectedRows.'
119
+ def afterLoad(self):
101
120
  self.addRow(PivotGroupRow(['Selected'], (0,0), self.source.selectedRows, {}))
121
+ super().afterLoad()
102
122
 
103
- Sheet.addCommand('F', 'freq-col', 'vd.push(FreqTableSheet(sheet, cursorCol))', 'open Frequency Table grouped on current column, with aggregations of other columns')
104
- Sheet.addCommand('gF', 'freq-keys', 'vd.push(FreqTableSheet(sheet, *keyCols))', 'open Frequency Table grouped by all key columns on source sheet, with aggregations of other columns')
105
- Sheet.addCommand('zF', 'freq-summary', 'vd.push(FreqTableSheetSummary(sheet, Column("Total", sheet=sheet, getter=lambda col, row: "Total")))', 'open one-line summary for all rows and selected rows')
106
123
 
107
- ColumnsSheet.addCommand(ENTER, 'freq-row', 'vd.push(FreqTableSheet(source[0], cursorRow))', 'open a Frequency Table sheet grouped on column referenced in current row')
124
+ def makeFreqTableSheetSummary(sheet, *groupByCols):
125
+ return FreqTableSheetSummary(sheet.name,
126
+ '%s_freq' % '-'.join(col.name for col in groupByCols),
127
+ groupByCols=groupByCols,
128
+ source=sheet)
129
+
130
+
131
+ @VisiData.api
132
+ class FreqTablePreviewSheet(Sheet):
133
+ @property
134
+ def rows(self):
135
+ return self.source.cursorRow.sourcerows
136
+
137
+
138
+ FreqTableSheet.addCommand('', 'open-preview', 'vd.push(FreqTablePreviewSheet(sheet.name, "preview", source=sheet, columns=source.columns), pane=2); vd.options.disp_splitwin_pct=50', 'open split preview of source rows at cursor')
139
+
140
+ Sheet.addCommand('F', 'freq-col', 'vd.push(makeFreqTable(sheet, cursorCol))', 'open Frequency Table grouped on current column, with aggregations of other columns')
141
+ Sheet.addCommand('gF', 'freq-keys', 'vd.push(makeFreqTable(sheet, *keyCols))', 'open Frequency Table grouped by all key columns on source sheet, with aggregations of other columns')
142
+ Sheet.addCommand('zF', 'freq-summary', 'vd.push(makeFreqTableSheetSummary(sheet, Column("Total", sheet=sheet, getter=lambda col, row: "Total")))', 'open one-line summary for all rows and selected rows')
143
+
144
+ ColumnsSheet.addCommand(ENTER, 'freq-row', 'vd.push(makeFreqTable(source[0], cursorRow))', 'open a Frequency Table sheet grouped on column referenced in current row')
108
145
  vd.addMenuItem('Data', 'Frequency table', 'current row', 'freq-row')
109
146
 
110
147
  FreqTableSheet.addCommand('gu', 'unselect-rows', 'unselect(selectedRows)', 'unselect all source rows grouped in current row')
111
148
  FreqTableSheet.addCommand('g'+ENTER, 'dive-selected', 'vd.push(openRows(selectedRows))', 'open copy of source sheet with rows that are grouped in selected rows')
112
-
113
- vd.addGlobals({
114
- 'FreqTableSheet': FreqTableSheet,
115
- 'FreqTableSheetSummary': FreqTableSheetSummary,
116
- })
149
+ FreqTableSheet.addCommand('', 'select-first', 'for r in rows: source.select([r.sourcerows[0]])', 'select first source row in each bin')
150
+
151
+ FreqTableSheet.init('largest', lambda: 1)
152
+
153
+ vd.addGlobals(
154
+ makeFreqTable=makeFreqTable,
155
+ makeFreqTableSheetSummary=makeFreqTableSheetSummary,
156
+ FreqTableSheet=FreqTableSheet,
157
+ FreqTableSheetSummary=FreqTableSheetSummary,
158
+ HistogramColumn=HistogramColumn,
159
+ )
160
+
161
+ vd.addMenuItems('''
162
+ Data > Frequency table > current column > freq-col
163
+ Data > Frequency table > key columns > freq-keys
164
+ ''')
visidata/fuzzymatch.py ADDED
@@ -0,0 +1,414 @@
1
+ ''' Fuzzy String Matching.
2
+
3
+ This module is a pretty verbatim Python port of fzf's FuzzyMatchV2
4
+ trimmed down to a basic usecase of matching ASCII strings case sensitively.
5
+
6
+ For more information check out the source, I have not bothered to copy
7
+ the introductory comment/documentation:
8
+
9
+ https://github.com/junegunn/fzf/blob/b1a0ab8086/src/algo/algo.go
10
+
11
+ '''
12
+ import collections
13
+ from dataclasses import dataclass
14
+ from enum import Enum
15
+ from visidata import VisiData, vd
16
+
17
# Overwrite to true to get some diagnostic visualization
DEBUG = False

# Base score for a matched character, and the penalties applied when a gap
# between matched characters is opened or extended.
scoreMatch = 16
scoreGapStart = -3
scoreGapExtension = -1

# We prefer matches at the beginning of a word, but the bonus should not be
# too great to prevent the longer acronym matches from always winning over
# shorter fuzzy matches. The bonus point here was specifically chosen that
# the bonus is cancelled when the gap between the acronyms grows over
# 8 characters, which is approximately the average length of the words found
# in web2 dictionary and my file system.
#
# Floor division keeps this (and every bonus derived from it) an int, so that
# scores built from these constants are ints as well.
bonusBoundary = scoreMatch // 2

# Although bonus point for non-word characters is non-contextual, we need it
# for computing bonus points for consecutive chunks starting with a non-word
# character.
bonusNonWord = scoreMatch // 2

# Edge-triggered bonus for matches in camelCase words.
# Compared to word-boundary case, they don't accompany single-character gaps
# (e.g. FooBar vs. foo-bar), so we deduct bonus point accordingly.
bonusCamel123 = bonusBoundary + scoreGapExtension

# Minimum bonus point given to characters in consecutive chunks.
# Note that bonus points for consecutive matches shouldn't have been needed
# if we used fixed match score as in the original algorithm.
bonusConsecutive = -(scoreGapStart + scoreGapExtension)

# The first character in the typed pattern usually has more significance
# than the rest so it's important that it appears at special positions where
# bonus points are given, e.g. 'to-go' vs. 'ongoing' on 'og' or on 'ogo'.
# The amount of the extra bonus should be limited so that the gap penalty is
# still respected.
bonusFirstCharMultiplier = 2

# Extra bonus for word boundary after whitespace character or beginning of the string
bonusBoundaryWhite = bonusBoundary + 2

# Extra bonus for word boundary after slash, colon, semi-colon, and comma
bonusBoundaryDelimiter = bonusBoundary + 1

# Characters classified as delimiters.
delimiterChars = '/,:;|'
61
+
62
+ vd.theme_option('color_match', 'red', 'color for matching chars in palette chooser')
63
+
64
# Characters classified as whitespace.
whiteChars = ' \t\n\v\f\r\x85\xA0'

# Character classes.  The numeric order is significant: classes are compared
# with `>` against charNonWord when bonuses are computed.
charWhite = 0
charNonWord = 1
charDelimiter = 2
charLower = 3
charUpper = 4
charLetter = 5
charNumber = 6

# Class used as the "previous character" class before any character is seen.
initialCharClass = charWhite
76
+
77
+
78
def asciiFuzzyIndex(target, pattern):
    '''Return a fuzzy* starting position of the pattern,
    or -1, if pattern isn't a fuzzy match.

    The pattern matches when its characters occur in `target` in order
    (not necessarily adjacent).  An empty pattern returns 0.

    *the position is adapted one back, if possible,
    for bonus determination reasons.
    '''
    first_idx = 0
    search_from = 0
    for pidx, pchar in enumerate(pattern):
        found = target.find(pchar, search_from)
        if found < 0:
            return -1
        if pidx == 0 and found > 0:
            # Step back to find the right bonus point
            first_idx = found - 1
        # the next pattern character must occur after this one
        search_from = found + 1
    return first_idx
95
+
96
+
97
def charClassOfAscii(char):
    '''Return the character class constant for `char`.

    Lowercase letters, uppercase letters and digits are checked first, then
    membership in `whiteChars` and `delimiterChars`; anything else is
    charNonWord.
    '''
    if 'a' <= char <= 'z':
        return charLower
    if 'A' <= char <= 'Z':
        return charUpper
    if '0' <= char <= '9':
        return charNumber
    if char in whiteChars:
        return charWhite
    if char in delimiterChars:
        return charDelimiter
    return charNonWord
109
+
110
+
111
def bonusFor(prevClass, class_):
    '''Return the bonus for a character of class `class_` that follows a
    character of class `prevClass`.
    '''
    if class_ > charNonWord:
        if prevClass == charWhite:
            # Word boundary after whitespace
            return bonusBoundaryWhite
        if prevClass == charDelimiter:
            # Word boundary after a delimiter character
            return bonusBoundaryDelimiter
        if prevClass == charNonWord:
            # Word boundary
            return bonusBoundary

    # camelCase: lower -> Upper;  letter123: non-digit -> digit
    camelHump = prevClass == charLower and class_ == charUpper
    digitStart = prevClass != charNumber and class_ == charNumber
    if camelHump or digitStart:
        return bonusCamel123

    if class_ == charNonWord:
        return bonusNonWord
    if class_ == charWhite:
        return bonusBoundaryWhite
    return 0
135
+
136
+
137
def debugV2(T, pattern, F, lastIdx, H, C):
    '''Visualize the score matrix and matching positions.

    Prints, for each pattern character, one line of scores taken from `H`
    and one line of the positive entries of `C`, covering target columns
    F[0]..lastIdx.  The characters of `T` are printed once as a header.
    '''
    origin = int(F[0])
    width = lastIdx - F[0] + 1

    for i, f in enumerate(F):
        rowStart = i * width

        if i == 0:
            # header line: the target characters of the displayed columns
            header = ''.join(f' {T[j]} ' for j in range(f, lastIdx + 1))
            print(' ' + header)

        # score line: columns before this row's first occurrence show as 0
        padding = ''.join(' 0 ' for _ in range(F[0], f))
        scores = ''.join(
            f'{int(H[rowStart + idx - origin]):2d} ' for idx in range(f, lastIdx + 1)
        )
        print(pattern[i] + ' ' + padding + scores)

        # consecutive-length line: only positive entries are shown
        cells = []
        for idx, p in enumerate(C[rowStart : rowStart + width]):
            if idx + origin < int(F[i]):
                p = 0
            cells.append(f'{p:2d} ' if p > 0 else ' ')
        print(' ' + ''.join(cells))
164
+
165
+
166
@dataclass
class MatchResult:
    '''Represents a scored match of a fuzzymatching search.

    start: starting index of where the pattern is in the target sequence
        (-1 when the pattern does not match)
    end: Similarly, the end index (exclusive; -1 when there is no match)
    score: A value of how good the match is (0 when there is no match).
    positions: A list of indices, indexing into the target sequence.
        Corresponds to every position a letter of the pattern was found
        for this particular alignment.  None when the pattern does not match.
    '''

    start: int
    end: int
    score: int
    positions: 'list[int] | None'
182
+
183
+
184
def _fuzzymatch(target: str, pattern: str) -> MatchResult:
    '''Fuzzy string matching algorithm.

    For a target sequence, check whether (and how good) a pattern is matching.

    Returns a MatchResult, which contains start and end index of the match,
    an integer score, and the positions where the pattern occurred.  The
    positions are collected while backtracking from the end of the match, so
    they are listed in descending order.

    An empty pattern returns MatchResult(0, 0, 0, []); a pattern that does
    not match returns MatchResult(-1, -1, 0, None).

    The matching is case sensitive, so it's necessary to lower input and pattern
    in the caller, if preferred otherwise.

    The functionality is based on fzf's FuzzyMatchV2, minus some advanced features.
    '''
    patternLength = len(pattern)
    if patternLength == 0:
        return MatchResult(0, 0, 0, [])
    targetLength = len(target)

    # Phase 1: Optimized search for ASCII string
    idx = asciiFuzzyIndex(target, pattern)
    if idx < 0:
        return MatchResult(-1, -1, 0, None)

    # Score and consecutive-length of the first pattern character per position
    H0 = [0] * targetLength
    C0 = [0] * targetLength
    # Bonus point for each position
    B = [0] * targetLength
    # The first occurrence of each character in the pattern
    F = [0] * patternLength
    T = list(target)

    # Phase 2: Calculate bonus for each point
    maxScore, maxScorePos = 0, 0
    pidx, lastIdx = 0, 0
    pchar0 = pchar = pattern[0]
    prevH0, prevClass, inGap = 0, initialCharClass, False

    for tpos in range(idx, targetLength):
        char = T[tpos]
        class_ = charClassOfAscii(char)
        bonus = bonusFor(prevClass, class_)
        B[tpos] = bonus
        prevClass = class_

        if char == pchar:
            if pidx < patternLength:
                F[pidx] = tpos
                pidx += 1
                pchar = pattern[min(pidx, patternLength - 1)]
            # once the whole pattern was seen, pchar stays on the last pattern
            # character, so lastIdx ends up at its last occurrence
            lastIdx = tpos

        if char == pchar0:
            score = scoreMatch + bonus * bonusFirstCharMultiplier
            H0[tpos] = score
            C0[tpos] = 1
            if patternLength == 1 and score > maxScore:
                maxScore, maxScorePos = score, tpos
                if bonus >= bonusBoundary:
                    break
            inGap = False
        else:
            gapPenalty = scoreGapExtension if inGap else scoreGapStart
            H0[tpos] = max(prevH0 + gapPenalty, 0)
            C0[tpos] = 0
            inGap = True
        prevH0 = H0[tpos]

    if pidx != patternLength:
        return MatchResult(-1, -1, 0, None)
    if patternLength == 1:
        # int() keeps the score type the same as on the multi-character path
        return MatchResult(maxScorePos, maxScorePos + 1, int(maxScore), [maxScorePos])

    # Phase 3: Fill in score matrix (H)
    # do not allow omission.
    # H and C are flat row-major matrices: one row per pattern character,
    # one column per target position in f0..lastIdx.
    f0 = F[0]
    width = lastIdx - f0 + 1
    H = [0] * width * patternLength
    H[:width] = H0[f0 : lastIdx + 1]

    # Possible length of consecutive chunk at each position.
    C = [0] * width * patternLength
    C[:width] = C0[f0 : lastIdx + 1]

    for prow in range(1, patternLength):
        f = F[prow]
        pchar = pattern[prow]
        isLastRow = prow == patternLength - 1
        inGap = False
        # flat index of column f in this row; base-1 is the cell to its left
        # and base-1-width the cell diagonally up-left
        base = prow * width + f - f0
        H[base - 1] = 0
        for col in range(f, lastIdx + 1):
            cell = base + col - f
            char = T[col]
            s1, consecutive = 0, 0

            # score when this target character is skipped (gap)
            if inGap:
                s2 = H[cell - 1] + scoreGapExtension
            else:
                s2 = H[cell - 1] + scoreGapStart

            # score when this target character is matched
            if pchar == char:
                s1 = H[cell - 1 - width] + scoreMatch
                b = B[col]
                consecutive = C[cell - 1 - width] + 1
                if consecutive > 1:
                    # bonus of the first character of the consecutive chunk
                    fb = B[col - consecutive + 1]
                    # Break consecutive chunk
                    if b >= bonusBoundary and b > fb:
                        consecutive = 1
                    else:
                        b = max(b, max(bonusConsecutive, fb))
                if s1 + b < s2:
                    s1 += B[col]
                    consecutive = 0
                else:
                    s1 += b
            C[cell] = consecutive

            inGap = s1 < s2
            score = max(max(s1, s2), 0)
            if isLastRow and score > maxScore:
                maxScore, maxScorePos = score, col
            H[cell] = score

    if DEBUG:
        debugV2(T, pattern, F, lastIdx, H, C)

    # Phase 4. (Optional) Backtrace to find character positions
    pos = []
    i = patternLength - 1
    j = maxScorePos
    preferMatch = True
    while True:
        I = i * width
        j0 = j - f0
        s = H[I + j0]

        s1, s2 = 0, 0
        if i > 0 and j >= int(F[i]):
            s1 = H[I - width + j0 - 1]
        if j > int(F[i]):
            s2 = H[I + j0 - 1]

        if s > s1 and (s > s2 or s == s2 and preferMatch):
            pos.append(j)
            if i == 0:
                break
            i -= 1
        preferMatch = (
            C[I + j0] > 1 or I + width + j0 + 1 < len(C) and C[I + width + j0 + 1] > 0
        )
        j -= 1

    # Start offset we return here is only relevant when begin tiebreak is used.
    # However finding the accurate offset requires backtracking, and we don't
    # want to pay extra cost for the option that has lost its importance.
    return MatchResult(j, maxScorePos + 1, int(maxScore), pos)
357
+
358
+
359
+ def _format_match(s, positions):
360
+ out = list(s)
361
+ for p in positions:
362
+ out[p] = f'[:match]{out[p]}[/]'
363
+ return "".join(out)
364
+
365
# One scored haystack entry: (score, formatted, match).
CombinedMatch = collections.namedtuple(
    'CombinedMatch',
    ['score', 'formatted', 'match'],
)
366
+
367
+
368
@VisiData.api
def fuzzymatch(vd, haystack: "list[dict[str, str]]", needles: "list[str]") -> "list[CombinedMatch]":
    '''Return a list of CombinedMatch, one per dict in *haystack* with at least one value that fuzzy-matches a needle, sorted by descending score.

    For each matching haystack dict, the CombinedMatch holds:

    - score: the sum of the squared scores of the matching values
    - formatted: dict of key -> value with the matched characters wrapped in `[:match]...[/]`
    - match: the original haystack dict (not modified)

    When several needles match the same value, the last matching needle
    determines that value's score and formatting.
    '''

    matches = []
    for h in haystack:
        match = {}
        formatted_hay = {}
        for k, v in h.items():
            for p in needles:
                mr = _fuzzymatch(v, p)
                if mr.score > 0:
                    match[k] = mr
                    formatted_hay[k] = _format_match(v, mr.positions)

        if match:
            # square to prefer larger scores in a single haystack
            score = int(sum(mr.score**2 for mr in match.values()))
            matches.append(CombinedMatch(score=score, formatted=formatted_hay, match=h))

    return sorted(matches, key=lambda m: -m.score)
389
+
390
+
391
@VisiData.api
def test_fuzzymatch(vd):
    'Self-checks for asciiFuzzyIndex, charClassOfAscii and _fuzzymatch.'
    # (pattern, expected index) against the target 'helo'
    indexCases = [
        ('h', 0),
        ('hlo', 0),
        ('e', 0),
        ('el', 0),
        ('eo', 0),
        ('l', 1),
        ('lo', 1),
        ('o', 2),
        ('ooh', -1),
    ]
    for pattern, expected in indexCases:
        assert asciiFuzzyIndex('helo', pattern) == expected

    # (character, expected class)
    classCases = [
        ('a', charLower),
        ('C', charUpper),
        ('2', charNumber),
        (' ', charWhite),
        (',', charDelimiter),
    ]
    for char, expected in classCases:
        assert charClassOfAscii(char) == expected

    # (target, pattern, expected result)
    matchCases = [
        ('hello', '', MatchResult(0, 0, 0, [])),
        ('hello', 'nono', MatchResult(-1, -1, 0, None)),
        ('hello', 'l', MatchResult(2, 3, 16, [2])),
        ('hello world', 'elo wo', MatchResult(1, 8, 127, [7, 6, 5, 4, 2, 1])),
    ]
    for target, pattern, expected in matchCases:
        assert _fuzzymatch(target, pattern) == expected