kash-shell 0.3.25__py3-none-any.whl → 0.3.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. kash/actions/__init__.py +51 -6
  2. kash/actions/core/minify_html.py +2 -2
  3. kash/commands/base/general_commands.py +4 -2
  4. kash/commands/help/assistant_commands.py +4 -3
  5. kash/commands/help/welcome.py +1 -1
  6. kash/config/colors.py +7 -3
  7. kash/config/logger.py +4 -0
  8. kash/config/text_styles.py +1 -0
  9. kash/config/unified_live.py +249 -0
  10. kash/docs/markdown/assistant_instructions_template.md +3 -3
  11. kash/docs/markdown/topics/a1_what_is_kash.md +22 -20
  12. kash/docs/markdown/topics/a2_installation.md +10 -10
  13. kash/docs/markdown/topics/a3_getting_started.md +8 -8
  14. kash/docs/markdown/topics/a4_elements.md +3 -3
  15. kash/docs/markdown/topics/a5_tips_for_use_with_other_tools.md +12 -12
  16. kash/docs/markdown/topics/b0_philosophy_of_kash.md +17 -17
  17. kash/docs/markdown/topics/b1_kash_overview.md +7 -7
  18. kash/docs/markdown/topics/b2_workspace_and_file_formats.md +1 -1
  19. kash/docs/markdown/topics/b3_modern_shell_tool_recommendations.md +1 -1
  20. kash/docs/markdown/topics/b4_faq.md +7 -7
  21. kash/docs/markdown/welcome.md +1 -1
  22. kash/embeddings/embeddings.py +110 -39
  23. kash/embeddings/text_similarity.py +2 -2
  24. kash/exec/shell_callable_action.py +4 -3
  25. kash/help/help_embeddings.py +5 -2
  26. kash/mcp/mcp_server_sse.py +0 -5
  27. kash/model/graph_model.py +2 -0
  28. kash/model/items_model.py +4 -4
  29. kash/shell/output/shell_output.py +2 -2
  30. kash/shell/shell_main.py +64 -6
  31. kash/shell/version.py +18 -2
  32. kash/utils/file_utils/csv_utils.py +105 -0
  33. kash/utils/rich_custom/multitask_status.py +19 -5
  34. kash/web_gen/templates/base_styles.css.jinja +384 -31
  35. kash/web_gen/templates/base_webpage.html.jinja +43 -0
  36. kash/web_gen/templates/components/toc_styles.css.jinja +25 -4
  37. kash/web_gen/templates/components/tooltip_styles.css.jinja +2 -0
  38. kash/web_gen/templates/content_styles.css.jinja +23 -9
  39. kash/web_gen/templates/item_view.html.jinja +12 -4
  40. kash/web_gen/templates/simple_webpage.html.jinja +2 -2
  41. kash/xonsh_custom/custom_shell.py +6 -6
  42. {kash_shell-0.3.25.dist-info → kash_shell-0.3.27.dist-info}/METADATA +59 -56
  43. {kash_shell-0.3.25.dist-info → kash_shell-0.3.27.dist-info}/RECORD +46 -44
  44. {kash_shell-0.3.25.dist-info → kash_shell-0.3.27.dist-info}/WHEEL +0 -0
  45. {kash_shell-0.3.25.dist-info → kash_shell-0.3.27.dist-info}/entry_points.txt +0 -0
  46. {kash_shell-0.3.25.dist-info → kash_shell-0.3.27.dist-info}/licenses/LICENSE +0 -0
@@ -15,11 +15,11 @@ Type `help` for the full documentation.
15
15
  The simplest way to illustrate how to use kash is by example.
16
16
  You can go through the commands below a few at a time, trying each one.
17
17
 
18
- This is a "real" example that uses ffmpeg and a few other libraries.
19
- So to get it to work you must install not just the main shell but the kash "media kit"
18
+ This is a “real” example that uses ffmpeg and a few other libraries.
19
+ So to get it to work you must install not just the main shell but the kash “media kit”
20
20
  with extra dependencies.
21
21
  This is discussed in [the installation instructions](#installation-steps).
22
- If you don't have these already installed, you can add these tools:
22
+ If you don’t have these already installed, you can add these tools:
23
23
 
24
24
  Then run `kash` to start.
25
25
 
@@ -170,8 +170,8 @@ All of these steps are just actions.
170
170
 
171
171
  ### Creating a New Workspace
172
172
 
173
- Although you don't always need one, a *workspace* is very helpful for any real work in
174
- kash. It's just a directory of files, plus a `.kash/` directory with various logs and
173
+ Although you don’t always need one, a *workspace* is very helpful for any real work in
174
+ kash. It’s just a directory of files, plus a `.kash/` directory with various logs and
175
175
  metadata.
176
176
 
177
177
  Note the `.kash/cache` directory contains all the downloaded videos and media you
@@ -192,7 +192,7 @@ By default, when you are not using the shell inside a workspace directory, or wh
192
192
  run kash the first time, it uses the default *global workspace*.
193
193
 
194
194
  Once you create a workspace, you can `cd` into that workspace and that will become the
195
- current workspace. (If you're familiar with how the `git` command-line works in
195
+ current workspace. (If you’re familiar with how the `git` command-line works in
196
196
  conjunction with the `.git/` directory, this behavior is very similar.)
197
197
 
198
198
  To start a new workspace, run a command like
@@ -230,7 +230,7 @@ A few of the most important commands for managing files and work are these:
230
230
 
231
231
  - `workspace` shows or selects or creates a new workspace.
232
232
  Initially you work in the default global workspace (typically at `~/Kash/workspace`)
233
- but for more real work you'll want to create a workspace, which is a directory to hold
233
+ but for more real work you’ll want to create a workspace, which is a directory to hold
234
234
  the files you are working with.
235
235
 
236
236
  - `select` shows or sets selections, which are the set of files the next command will
@@ -244,7 +244,7 @@ A few of the most important commands for managing files and work are these:
244
244
 
245
245
  - `logs` to see full logs (typically more detailed than what you see in the console).
246
246
 
247
- - `history` to see recent commands you've run.
247
+ - `history` to see recent commands you’ve run.
248
248
 
249
249
  - `import_item` to add a resource such as a URL or a file to your local workspace.
250
250
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  ### What is Included?
4
4
 
5
- I've tried to build independently useful pieces that fit together in a simple way:
5
+ I’ve tried to build independently useful pieces that fit together in a simple way:
6
6
 
7
7
  - The kash **action framework**:
8
8
 
@@ -75,9 +75,9 @@ I've tried to build independently useful pieces that fit together in a simple wa
75
75
  OSC 8 links
76
76
 
77
77
  - Sadly, we may have mind-boggling AI tools, but Terminals are still incredibly
78
- archaic and don't support these features well (more on this below) but I have a new
78
+ archaic and don’t support these features well (more on this below) but I have a new
79
79
  terminal, Kerm, that shows these as tooltips and makes every command clickable
80
- (please contact me if you'd like an early developer preview, as I'd love feedback)
80
+ (please contact me if you’d like an early developer preview, as I’d love feedback)
81
81
 
82
82
  ## Tools Used by Kash
83
83
 
@@ -17,14 +17,14 @@ I tried half a dozen different popular terminals on Mac
17
17
  [Hyper](https://hyper.is/)). Unfortunately, none offer really good support right out of
18
18
  the box, but I encourage you to try
19
19
 
20
- ✨**Would you be willing to help test something new?** If you've made it this far and are
20
+ ✨**Would you be willing to help test something new?** If you’ve made it this far and are
21
21
  still reading, I have a request.
22
- So alongside kash, I've begun to build a new terminal app, **Kerm**, that has the
22
+ So alongside kash, I’ve begun to build a new terminal app, **Kerm**, that has the
23
23
  features we would want in a modern command line, such as clickable links and commands,
24
24
  tooltips, and image support.
25
25
  Kash also takes advantage of this support by embedding OSC 8 links.
26
26
  It is *so* much nicer to use.
27
- I'd like feedback so please [message me](https://twitter.com/ojoshe) if you'd like to
27
+ I’d like feedback so please [message me](https://twitter.com/ojoshe) if you’d like to
28
28
  try it out an early dev version!
29
29
 
30
30
  ### Choosing an Editor
@@ -34,7 +34,7 @@ Kash respects the `EDITOR` environment variable if you use the `edit` command.
34
34
 
35
35
  ### Using on macOS
36
36
 
37
- Kash calls `open` to open some files, so in general, it's convenient to make sure your
37
+ Kash calls `open` to open some files, so in general, it’s convenient to make sure your
38
38
  preferred editor is set up for `.yml` and `.md` files.
39
39
 
40
40
  For convenience, a reminder on how to do this:
@@ -42,7 +42,7 @@ For convenience, a reminder on how to do this:
42
42
  - In Finder, pick a `.md` or `.yml` file and hit Cmd-I (or right-click and select Get
43
43
  Info).
44
44
 
45
- - Select the editor, such as Cursor or VSCode or Obsidian, and click the "Change All…"
45
+ - Select the editor, such as Cursor or VSCode or Obsidian, and click the “Change All…”
46
46
  button to have it apply to all files with that extension.
47
47
 
48
48
  - Repeat with each file type.
@@ -61,23 +61,23 @@ out of the box to edit workspace files in Markdown, HTML, and YAML in kash works
61
61
  Kash uses Markdown files with YAML frontmatter, which is fully compatible with
62
62
  [Obsidian](https://obsidian.md/). Some notes:
63
63
 
64
- - In Obsidian's preferences, under Editor, turn on "Strict line breaks".
64
+ - In Obsidian’s preferences, under Editor, turn on “Strict line breaks”.
65
65
 
66
- - This makes the line breaks in kash's normalized Markdown output work well in Obsidian.
66
+ - This makes the line breaks in kash’s normalized Markdown output work well in Obsidian.
67
67
 
68
68
  - Some kash files also contain HTML in Markdown.
69
- This works fine, but note that only the current line's HTML is shown in Obsidian.
69
+ This works fine, but note that only the current line’s HTML is shown in Obsidian.
70
70
 
71
71
  - Install the [Front Matter Title
72
72
  plugin](https://github.com/snezhig/obsidian-front-matter-title):
73
73
 
74
- - Go to settings, enable community plugins, search for "Front Matter Title" and
74
+ - Go to settings, enable community plugins, search for “Front Matter Title” and
75
75
  install.
76
76
 
77
- - Under "Installed Plugins," adjust the settings to enable "Replace shown title in
78
- file explorer," "Replace shown title in graph," etc.
77
+ - Under “Installed Plugins,” adjust the settings to enable “Replace shown title in
78
+ file explorer,” “Replace shown title in graph,” etc.
79
79
 
80
- - You probably want to keep the "Replace titles in header of leaves" off so you can
80
+ - You probably want to keep the “Replace titles in header of leaves” off so you can
81
81
  still see original filenames if needed.
82
82
 
83
83
  - Now titles are easy to read for all kash notes.
@@ -3,14 +3,14 @@
3
3
  > “*Civilization advances by extending the number of important operations which we can
4
4
  > perform without thinking about them.*” —Alfred North Whitehead
5
5
 
6
- Here is a bit more motivation for experimenting with kash, why I think it's potentially
6
+ Here is a bit more motivation for experimenting with kash, why I think it’s potentially
7
7
  so useful, and some design principles.
8
8
  (You may skip ahead to the next section if you just want a more concrete overview!)
9
9
 
10
- ### Why Apps Can't Solve All Your Problems
10
+ ### Why Apps Can’t Solve All Your Problems
11
11
 
12
12
  AI has radically changed the way we use software.
13
- With LLMs and other generative AI models, we've seen big improvements in two areas:
13
+ With LLMs and other generative AI models, we’ve seen big improvements in two areas:
14
14
 
15
15
  1. Powerful general-purpose new AI tools (ChatGPT, Perplexity, etc.)
16
16
 
@@ -18,20 +18,20 @@ With LLMs and other generative AI models, we've seen big improvements in two are
18
18
  want to solve, like Notion, Figma, Descript, etc.
19
19
 
20
20
  While we have these powerful cloud apps, we all know numerous situations where our
21
- problems aren't easily solved or automated with single tool like ChatGPT, Notion, Google
21
+ problems aren’t easily solved or automated with single tool like ChatGPT, Notion, Google
22
22
  Docs, Slack, Excel, and Zapier.
23
23
 
24
24
  If you want to use any of the newest AI models and APIs for something not supported by
25
25
  an existing tool, you generally have to design and build it yourself—in Python and/or a
26
26
  full-stack web app.
27
27
 
28
- It's true tools like GitHub Copilot, Claude Code, and Cursor can help anyone write code
28
+ It’s true tools like GitHub Copilot, Claude Code, and Cursor can help anyone write code
29
29
  much faster. But even if you have a tool like this, building polished apps that are good
30
30
  enough people will pay them takes time, and many good product ideas never get built.
31
- And the curse of [Conway's Law](https://en.wikipedia.org/wiki/Conway%27s_law) means many
32
- companies won't add specific features you want, or at best are likely to do it slowly.
31
+ And the curse of [Conway’s Law](https://en.wikipedia.org/wiki/Conway%27s_law) means many
32
+ companies won’t add specific features you want, or at best are likely to do it slowly.
33
33
 
34
- In short, in spite of AI tools accelerating software, certain things don't change: we
34
+ In short, in spite of AI tools accelerating software, certain things don’t change: we
35
35
  are waiting for developers, product managers, designers, and entrepreneurs to design and
36
36
  ship solutions for us.
37
37
 
@@ -58,9 +58,9 @@ Command-line shells generally still suffer from three big issues:
58
58
  - A text-based interface many find confusing or ugly
59
59
 
60
60
  - No easy, “native” support for modern tools, apps, and APIs (especially LLMs—and using
61
- `curl` to call OpenAI APIs doesn't count!)
61
+ `curl` to call OpenAI APIs doesn’t count!)
62
62
 
63
- Even worse, command lines haven't gotten much better.
63
+ Even worse, command lines haven’t gotten much better.
64
64
  Few companies make money shipping new command-line tooling.
65
65
  (In the last few years this has slowly starting to change with tools like nushell, fish,
66
66
  and Warp.)
@@ -73,7 +73,7 @@ developer, a designer, or an enterpreneur building a product.
73
73
  Any tool that lets you solve complex problems yourself, without waiting for engineers
74
74
  and designers, can radically improve your productivity.
75
75
 
76
- I think it's a good time to revisit this idea.
76
+ I think it’s a good time to revisit this idea.
77
77
 
78
78
  In a post-LLM world, it should be possible to do more things without so much time and
79
79
  effort spent (even with the help of LLMs) on coding and UI/UX design.
@@ -84,7 +84,7 @@ to see how well it works.
84
84
 
85
85
  ### The Goals of Kash
86
86
 
87
- Kash is an experimental attempt at building the tool I've wanted for a long time, using
87
+ Kash is an experimental attempt at building the tool I’ve wanted for a long time, using
88
88
  a command line as a starting point, and with an initial focus on content-related tasks.
89
89
 
90
90
  That brings us to the goals behind building a new, AI-native shell.
@@ -99,17 +99,17 @@ That brings us to the goals behind building a new, AI-native shell.
99
99
  - **Make complex tasks possible:** Highly complex tasks and workflows should be easy to
100
100
  assemble (and rerun if they need to be automated) by adding new primitive actions and
101
101
  combining primitive actions into more complex workflows.
102
- You shouldn't need to be a programmer to use any task—but any task should be
102
+ You shouldn’t need to be a programmer to use any task—but any task should be
103
103
  extensible with arbitrary code (written by you and an LLM) when needed.
104
104
 
105
105
  - **Augment human skills and judgement:** Many AI agent efforts aim for pure automation.
106
106
  But even with powerful LLMs and tools, full automation is rare.
107
- Invariably, the best results come from human review wherever it's needed—experimenting
107
+ Invariably, the best results come from human review wherever it’s needed—experimenting
108
108
  with different models and prompts, looking at what works, focusing expert human
109
109
  attention in the right places.
110
110
  The most flexible tools augment, not replace, your ability to review and manipulate
111
111
  information. It should help both very technical users, like developers, as well as less
112
- technical but sophisticated users who aren't traditional programmers.
112
+ technical but sophisticated users who aren’t traditional programmers.
113
113
 
114
114
  - **Accelerate discovery of the workflows that work best:** We have so many powerful
115
115
  APIs, models, libraries, and tools now—but the real bottleneck is in discovering and
@@ -125,7 +125,7 @@ That brings us to the goals behind building a new, AI-native shell.
125
125
 
126
126
  A better command line like a first step toward an item-based information operating
127
127
  system—an alternate, more flexible UX and information architecture for knowledge
128
- workflows. My hope is that kash becomes the tool you need when you don't know what tool
128
+ workflows. My hope is that kash becomes the tool you need when you don’t know what tool
129
129
  you need.
130
130
 
131
131
  ### Design Principles
@@ -155,7 +155,7 @@ Key design choices:
155
155
  transition)
156
156
 
157
157
  7. **Maintain context in workspaces** (keep files organized by project or effort in a
158
- folder that can be persisted, won't get lost, and includes content, metadata,
158
+ folder that can be persisted, won’t get lost, and includes content, metadata,
159
159
  actions, settings, selections, caches, history, etc.)
160
160
 
161
161
  8. **Maintain metadata on files** (so you always know where each piece of content comes
@@ -8,7 +8,7 @@ extensibility of a modern command line interface.
8
8
 
9
9
  The philosophy behind kash is similar to Unix shell tools: simple commands that can be
10
10
  combined flexibly in powerful ways.
11
- It operates on "items" such as URLs, files, or Markdown notes within a workspace
11
+ It operates on “items” such as URLs, files, or Markdown notes within a workspace
12
12
  directory.
13
13
 
14
14
  This command-line is also AI enabled.
@@ -29,7 +29,7 @@ intuitive than old Unix commands.
29
29
  ### MCP Support
30
30
 
31
31
  If the idea of having lots of commands runnable by an LLM sounds to you a little like
32
- MCP, you're right. Any action in kash can also be an MCP tool!
32
+ MCP, you’re right. Any action in kash can also be an MCP tool!
33
33
 
34
34
  You can connect Claude Desktop or Cursor or other MCP clients to kash and use any kash
35
35
  action as a tool. However, unlike the complexity of writing a new MCP server, the idea
@@ -41,22 +41,22 @@ Anyone, including kash itself, can write new actions.
41
41
  You write a simple Python function, add a decorator, and it becomes an action you can
42
42
  use in your shell.
43
43
 
44
- Finally, getting really useful things to work still takes effort, so I've also added a
44
+ Finally, getting really useful things to work still takes effort, so I’ve also added a
45
45
  number of little libraries to help with this.
46
46
 
47
47
  ### Supporting Complex Tasks
48
48
 
49
- Because it's really just a set of Python libraries, kash is more capable than a typical
49
+ Because it’s really just a set of Python libraries, kash is more capable than a typical
50
50
  shell. It is starting to become a sort of AI-friendly scripting framework as well.
51
51
 
52
52
  Inputs and outputs of commands are stored as files, so you can easily chain commands
53
53
  together and inspect intermediate results.
54
54
 
55
55
  When possible, actions are nondestructive and idempotent—that is, they will either
56
- create new files or simply skip an operation if it's already complete.
56
+ create new files or simply skip an operation if it’s already complete.
57
57
 
58
58
  So it can work a bit like a Makefile: suppose you run a command like `transcribe` on a
59
- video. If you've already run that command on the same YouTube URL, kash knows it and can
59
+ video. If you’ve already run that command on the same YouTube URL, kash knows it and can
60
60
  recognize the downloaded video and transcribed text is already present in your current
61
61
  workspace.
62
62
 
@@ -95,7 +95,7 @@ original document), the sources are listed in a `derived_from` array within the
95
95
  This means actions can find citations or other data on the provenance of a given piece
96
96
  of information.
97
97
 
98
- This might sound a little complex, but it's quite simple in practice.
98
+ This might sound a little complex, but it’s quite simple in practice.
99
99
  All the metadata is in a standard format,
100
100
  [Frontmatter Format](https://github.com/jlevy/frontmatter-format), and the information
101
101
  is compatible with other apps and pretty self explanatory.
@@ -2,7 +2,7 @@
2
2
 
3
3
  A kash workspace is simply a directory of files.
4
4
  The goal is for a workspace to be easy to use not just with kash but with other editors
5
- or tools, so it's possible to edit, share, or commit files to version control.
5
+ or tools, so it’s possible to edit, share, or commit files to version control.
6
6
  It makes sense to devote a workspace to a single topic, project, or area of research.
7
7
 
8
8
  File formats and conventions:
@@ -2,7 +2,7 @@
2
2
 
3
3
  Many of us (myself included) have long believed in sticking with tried-and-true bash and
4
4
  the classic command-line tools.
5
- While it's still wise to know these tools, we've in recent years seen many new tools
5
+ While it’s still wise to know these tools, we’ve in recent years seen many new tools
6
6
  emerge that are more powerful, modern, and cross-platform.
7
7
 
8
8
  When using kash it makes sense to use these.
@@ -17,7 +17,7 @@ Anyone, including kash itself, can write new actions easily.
17
17
 
18
18
  The philosophy behind kash is similar to Unix shell tools: simple commands that can be
19
19
  combined in flexible and powerful ways.
20
- It operates on "items" such as URLs, files, or Markdown notes within a workspace
20
+ It operates on “items” such as URLs, files, or Markdown notes within a workspace
21
21
  directory.
22
22
 
23
23
  For more detailed information, you can run `help` to get background and a list of
@@ -42,7 +42,7 @@ questions.
42
42
  ### How does kash accept both shell and assistant requests to the LLM with natural language?
43
43
 
44
44
  By default, if a command is valid shell or Python, kash will treat it as a shell
45
- command, using xonsh's conventions.
45
+ command, using xonsh’s conventions.
46
46
 
47
47
  Commands that begin with a `?` are automatically considered assistant requests.
48
48
 
@@ -136,9 +136,9 @@ fit kash commands and actions, reading metadata on items, etc.
136
136
 
137
137
  ### Can kash replace my regular shell?
138
138
 
139
- While kash doesn't aim to completely replace all uses of the shell—for example, that's
139
+ While kash doesn’t aim to completely replace all uses of the shell—for example, that’s
140
140
  hard to do in general for remote use, and people have many constraints, customizations,
141
- and preferences—I've found it's highly useful for a lot of situations.
141
+ and preferences—I’ve found it’s highly useful for a lot of situations.
142
142
  It is starting to replace bash or zsh for day-to-day local use on my laptop.
143
143
 
144
144
  Kash basically wraps xonsh, so you have almost all the functionality of xonsh and Python
@@ -154,18 +154,18 @@ Any command you type on the command-line in kash is a command.
154
154
  Some commands are basic, built-in commands.
155
155
  The idea is there are relatively few of these, and they do important primitive things
156
156
  like `select` (select or show selections), `show` (show an item), `files` (list
157
- files—kash's better version of `ls`), `workspace` (shows information about the current
157
+ files—kash’s better version of `ls`), `workspace` (shows information about the current
158
158
  workspace), or `logs` (shows the detailed logs for the current workspace).
159
159
  In Python, built-in commands are defined by simple functions.
160
160
 
161
161
  But most commands are defined as an *action*. Actions are invoked just like any other
162
- command but have a standard structure: they are assumed to perform an "action" on a set
162
+ command but have a standard structure: they are assumed to perform an “action” on a set
163
163
  of items (files of known types) and then save those items, all within an existing
164
164
  workspace. Actions are defined as a subclass of `Action` in Python.
165
165
 
166
166
  ### Does nvm (Node version manager) work in kash?
167
167
 
168
- It's hard to get nvm to work well in xonsh, but try [fnm](https://github.com/Schniz/fnm)
168
+ It’s hard to get nvm to work well in xonsh, but try [fnm](https://github.com/Schniz/fnm)
169
169
  instead! It works just as well and kash supports `fnm` automatically so it auto-detects
170
170
  and uses fnm to switch or install Node versions for directories with Node projects (i.e.
171
171
  there is an `.nvmrc`, `.node-version`, or `package.json` file).
@@ -6,7 +6,7 @@ You may simply ask a question and the kash assistant will help you.
6
6
  Press **space** (or type **?**), then write your question or request.
7
7
  Use `logs` for detailed logs.
8
8
 
9
- *I'd love to hear from you with issues, bugs, and ideas.
9
+ *I’d love to hear from you with issues, bugs, and ideas.
10
10
  Discuss at github.com/jlevy/kash or contact me github.com/jlevy or x.com/ojoshe (DMs
11
11
  open).*
12
12
 
@@ -1,16 +1,18 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import ast
4
+ import json
4
5
  from collections.abc import Iterable
5
6
  from pathlib import Path
6
- from typing import TYPE_CHECKING, TypeAlias
7
+ from typing import TYPE_CHECKING, Any, TypeAlias
7
8
 
9
+ import pandas as pd
8
10
  from pydantic.dataclasses import dataclass
9
11
  from strif import abbrev_list
10
12
 
11
13
  from kash.config.logger import get_logger
12
14
  from kash.llm_utils.init_litellm import init_litellm
13
- from kash.llm_utils.llms import DEFAULT_EMBEDDING_MODEL
15
+ from kash.llm_utils.llms import DEFAULT_EMBEDDING_MODEL, EmbeddingModel
14
16
 
15
17
  if TYPE_CHECKING:
16
18
  from pandas import DataFrame
@@ -18,15 +20,26 @@ if TYPE_CHECKING:
18
20
  log = get_logger(__name__)
19
21
 
20
22
 
21
- BATCH_SIZE = 1024
23
+ BATCH_SIZE: int = 1024
22
24
 
23
25
  Key: TypeAlias = str
24
26
 
25
- KeyVal: TypeAlias = tuple[Key, str]
26
- """
27
- A key-value pair where the key is a unique identifier (such as the path)
28
- and the value is the text to embed.
29
- """
27
+
28
+ @dataclass(frozen=True)
29
+ class EmbValue:
30
+ emb_text: str
31
+ data: dict[str, Any] | None = None
32
+
33
+
34
+ @dataclass(frozen=True)
35
+ class KeyVal:
36
+ """
37
+ A key-value pair where the key is a unique identifier (such as the path)
38
+ and the value is the text to embed and any additional data.
39
+ """
40
+
41
+ key: Key
42
+ value: EmbValue
30
43
 
31
44
 
32
45
  @dataclass
@@ -36,39 +49,45 @@ class Embeddings:
36
49
  small texts, the text itself).
37
50
  """
38
51
 
39
- data: dict[Key, tuple[str, list[float]]]
40
- """Mapping of key to text and embedding."""
52
+ data: dict[Key, tuple[EmbValue, list[float]]]
53
+ """Mapping of key to EmbValue and embedding."""
41
54
 
42
- def as_iterable(self) -> Iterable[tuple[Key, str, list[float]]]:
43
- return ((key, text, emb) for key, (text, emb) in self.data.items())
55
+ def as_iterable(self) -> Iterable[tuple[Key, EmbValue, list[float]]]:
56
+ return ((key, emb_value, emb) for key, (emb_value, emb) in self.data.items())
44
57
 
45
58
  def as_df(self) -> DataFrame:
46
59
  from pandas import DataFrame
47
60
 
48
- keys, texts, embeddings = zip(
49
- *[(key, text, emb) for key, (text, emb) in self.data.items()], strict=False
50
- )
61
+ if not self.data:
62
+ return DataFrame({"key": [], "text": [], "data": [], "embedding": []})
63
+
64
+ items = [(key, emb_value, emb) for key, (emb_value, emb) in self.data.items()]
65
+ keys, emb_values, embeddings = zip(*items, strict=False)
66
+
51
67
  return DataFrame(
52
68
  {
53
- "key": keys,
54
- "text": texts,
55
- "embedding": embeddings,
69
+ "key": list(keys),
70
+ "text": [ev.emb_text for ev in emb_values],
71
+ "data": [ev.data for ev in emb_values],
72
+ "embedding": list(embeddings),
56
73
  }
57
74
  )
58
75
 
59
- def __getitem__(self, key: Key) -> tuple[str, list[float]]:
76
+ def __getitem__(self, key: Key) -> tuple[EmbValue, list[float]]:
60
77
  if key in self.data:
61
78
  return self.data[key]
62
79
  else:
63
80
  raise KeyError(f"Key '{key}' not found in embeddings")
64
81
 
65
82
  @classmethod
66
- def embed(cls, keyvals: list[KeyVal], model=DEFAULT_EMBEDDING_MODEL) -> Embeddings:
83
+ def embed(
84
+ cls, keyvals: list[KeyVal], model: EmbeddingModel = DEFAULT_EMBEDDING_MODEL
85
+ ) -> Embeddings:
67
86
  from litellm import embedding
68
87
 
69
88
  init_litellm()
70
89
 
71
- data = {}
90
+ data: dict[Key, tuple[EmbValue, list[float]]] = {}
72
91
  log.info(
73
92
  "Embedding %d texts (model %s, batch size %s)…",
74
93
  len(keyvals),
@@ -76,21 +95,23 @@ class Embeddings:
76
95
  BATCH_SIZE,
77
96
  )
78
97
  for batch_start in range(0, len(keyvals), BATCH_SIZE):
79
- batch_end = batch_start + BATCH_SIZE
80
- batch = keyvals[batch_start:batch_end]
81
- keys = [kv[0] for kv in batch]
82
- texts = [kv[1] for kv in batch]
98
+ batch_end: int = batch_start + BATCH_SIZE
99
+ batch: list[KeyVal] = keyvals[batch_start:batch_end]
100
+ keys: list[Key] = [kv.key for kv in batch]
101
+ texts: list[str] = [kv.value.emb_text for kv in batch]
83
102
 
84
103
  response = embedding(model=model.litellm_name, input=texts)
85
104
 
86
105
  if not response.data:
87
106
  raise ValueError("No embedding response data")
88
107
 
89
- batch_embeddings = [e["embedding"] for e in response.data]
108
+ batch_embeddings: list[list[float]] = [e["embedding"] for e in response.data]
90
109
  data.update(
91
110
  {
92
- key: (text, emb)
93
- for key, text, emb in zip(keys, texts, batch_embeddings, strict=False)
111
+ key: (emb_value, emb)
112
+ for key, emb_value, emb in zip(
113
+ keys, [kv.value for kv in batch], batch_embeddings, strict=False
114
+ )
94
115
  }
95
116
  )
96
117
 
@@ -110,32 +131,82 @@ class Embeddings:
110
131
  def read_from_csv(cls, path: Path) -> Embeddings:
111
132
  import pandas as pd
112
133
 
113
- df = pd.read_csv(path)
134
+ df: pd.DataFrame = pd.read_csv(path)
114
135
  df["embedding"] = df["embedding"].apply(ast.literal_eval)
115
- data = {row["key"]: (row["text"], row["embedding"]) for _, row in df.iterrows()}
116
- return cls(data=data) # pyright: ignore
136
+
137
+ # Handle missing data column just in case.
138
+ if "data" in df.columns:
139
+ df["data"] = df["data"].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else None)
140
+ else:
141
+ df["data"] = None
142
+
143
+ data: dict[Key, tuple[EmbValue, list[float]]] = {}
144
+ for _, row in df.iterrows():
145
+ key = str(row["key"])
146
+ text = str(row["text"])
147
+ embedding = list(row["embedding"])
148
+ # Type-safe handling of data column
149
+ raw_data = row["data"] if "data" in df.columns else None
150
+ data_value: dict[str, Any] | None = (
151
+ raw_data if isinstance(raw_data, dict) or raw_data is None else None
152
+ )
153
+
154
+ data[key] = (
155
+ EmbValue(emb_text=text, data=data_value),
156
+ embedding,
157
+ )
158
+
159
+ return cls(data=data)
117
160
 
118
161
  def to_npz(self, path: Path) -> None:
119
162
  """Save embeddings in numpy's compressed format."""
120
163
  import numpy as np
121
164
 
122
165
  keys: list[Key] = list(self.data.keys())
123
- texts: list[str] = [self.data[k][0] for k in keys]
166
+ texts: list[str] = [self.data[k][0].emb_text for k in keys]
167
+ # Serialize data as JSON strings
168
+ data_strings: list[str] = [
169
+ json.dumps(self.data[k][0].data) if self.data[k][0].data is not None else ""
170
+ for k in keys
171
+ ]
124
172
  embeddings = np.array([self.data[k][1] for k in keys])
125
- np.savez_compressed(path, keys=keys, texts=texts, embeddings=embeddings)
173
+ np.savez_compressed(
174
+ path,
175
+ keys=keys,
176
+ texts=texts,
177
+ data=data_strings,
178
+ embeddings=embeddings,
179
+ )
126
180
 
127
181
  @classmethod
128
182
  def read_from_npz(cls, path: Path) -> Embeddings:
129
183
  """Load embeddings from numpy's compressed format."""
130
184
  import numpy as np
131
185
 
132
- with np.load(path) as data:
133
- loaded_data = {
134
- k: (t, e.tolist())
135
- for k, t, e in zip(data["keys"], data["texts"], data["embeddings"], strict=False)
136
- }
186
+ with np.load(path) as npz_data:
187
+ if "data" in npz_data.files:
188
+ data_array = npz_data["data"]
189
+ else:
190
+ # No data column, so no data.
191
+ data_array = None
192
+
193
+ loaded_data: dict[Key, tuple[EmbValue, list[float]]] = {}
194
+ for i, (k, t, e) in enumerate(
195
+ zip(
196
+ npz_data["keys"],
197
+ npz_data["texts"],
198
+ npz_data["embeddings"],
199
+ strict=False,
200
+ )
201
+ ):
202
+ data_str = data_array[i] if data_array is not None else ""
203
+ loaded_data[k] = (
204
+ EmbValue(emb_text=t, data=json.loads(data_str) if data_str else None),
205
+ e.tolist(),
206
+ )
207
+
137
208
  return cls(data=loaded_data)
138
209
 
139
210
  def __str__(self) -> str:
140
- dims = -1 if len(self.data) == 0 else len(next(iter(self.data))[1])
211
+ dims: int = -1 if len(self.data) == 0 else len(next(iter(self.data.values()))[1])
141
212
  return f"Embeddings({len(self.data)} items, {dims} dimensions)"
@@ -52,8 +52,8 @@ def rank_by_relatedness(
52
52
  query_embedding = response.data[0]["embedding"]
53
53
 
54
54
  scored_strings = [
55
- (key, text, relatedness_fn(query_embedding, emb))
56
- for key, text, emb in embeddings.as_iterable()
55
+ (key, emb_value.emb_text, relatedness_fn(query_embedding, emb))
56
+ for key, emb_value, emb in embeddings.as_iterable()
57
57
  ]
58
58
  scored_strings.sort(key=lambda x: x[2], reverse=True)
59
59