@thor123141245r/ai-translate 0.0.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +71 -22
  2. package/dist/cli.d.ts.map +1 -1
  3. package/dist/cli.js +77 -0
  4. package/dist/cli.js.map +1 -1
  5. package/dist/i18n/align.d.ts +3 -0
  6. package/dist/i18n/align.d.ts.map +1 -0
  7. package/dist/i18n/align.js +46 -0
  8. package/dist/i18n/align.js.map +1 -0
  9. package/dist/i18n/cache.d.ts +14 -0
  10. package/dist/i18n/cache.d.ts.map +1 -0
  11. package/dist/i18n/cache.js +36 -0
  12. package/dist/i18n/cache.js.map +1 -0
  13. package/dist/i18n/command.d.ts +7 -0
  14. package/dist/i18n/command.d.ts.map +1 -0
  15. package/dist/i18n/command.js +84 -0
  16. package/dist/i18n/command.js.map +1 -0
  17. package/dist/i18n/extract.d.ts +7 -0
  18. package/dist/i18n/extract.d.ts.map +1 -0
  19. package/dist/i18n/extract.js +24 -0
  20. package/dist/i18n/extract.js.map +1 -0
  21. package/dist/i18n/json-path.d.ts +7 -0
  22. package/dist/i18n/json-path.d.ts.map +1 -0
  23. package/dist/i18n/json-path.js +61 -0
  24. package/dist/i18n/json-path.js.map +1 -0
  25. package/dist/i18n/parse.d.ts +6 -0
  26. package/dist/i18n/parse.d.ts.map +1 -0
  27. package/dist/i18n/parse.js +44 -0
  28. package/dist/i18n/parse.js.map +1 -0
  29. package/dist/i18n/placeholders.d.ts +7 -0
  30. package/dist/i18n/placeholders.d.ts.map +1 -0
  31. package/dist/i18n/placeholders.js +44 -0
  32. package/dist/i18n/placeholders.js.map +1 -0
  33. package/dist/i18n/prompt.d.ts +13 -0
  34. package/dist/i18n/prompt.d.ts.map +1 -0
  35. package/dist/i18n/prompt.js +37 -0
  36. package/dist/i18n/prompt.js.map +1 -0
  37. package/dist/i18n/translate.d.ts +29 -0
  38. package/dist/i18n/translate.d.ts.map +1 -0
  39. package/dist/i18n/translate.js +199 -0
  40. package/dist/i18n/translate.js.map +1 -0
  41. package/dist/index.d.ts +1 -0
  42. package/dist/index.d.ts.map +1 -1
  43. package/dist/index.js +1 -0
  44. package/dist/index.js.map +1 -1
  45. package/package.json +5 -1
  46. package/.agentdocs/code-changes/2026-01-22/AI翻译器TS实现-实现.md +0 -22
  47. package/.agentdocs/code-changes/2026-01-23/CLI分发-npx实现.md +0 -18
  48. package/.agentdocs/code-changes/2026-01-23/sora-watermask-remover-国际化翻译-实现.md +0 -37
  49. package/.agentdocs/code-changes/2026-01-23/配置读取-环境变量优先-实现.md +0 -22
  50. package/.agentdocs/plans/2026-01-22/AI翻译器TS实现-优化方案.md +0 -67
  51. package/.agentdocs/plans/2026-01-23/CLI分发-npx方案.md +0 -60
  52. package/.agentdocs/plans/2026-01-23/sora-watermask-remover-国际化翻译-优化方案.md +0 -51
  53. package/.agentdocs/plans/2026-01-23/配置读取-环境变量优先-优化方案.md +0 -80
  54. package/SKILL.md +0 -103
  55. package/src/asyncTransform.ts +0 -31
  56. package/src/bin/ai-translate.ts +0 -5
  57. package/src/cli.ts +0 -313
  58. package/src/index.ts +0 -9
  59. package/src/logger.ts +0 -3
  60. package/src/model.ts +0 -139
  61. package/src/prompt.ts +0 -71
  62. package/src/split.ts +0 -111
  63. package/src/utils.ts +0 -15
  64. package/tsconfig.json +0 -19
package/src/cli.ts DELETED
@@ -1,313 +0,0 @@
- import fs from 'node:fs'
- import path from 'node:path'
- import { pipeline } from 'node:stream/promises'
- import { initProcLog } from 'debug-level'
- import { AppConfig, StringSchema, v } from '@commenthol/app-config'
- import type { SupportedTextSplitterLanguage } from '@langchain/textsplitters'
- import { logger } from './logger.js'
- import { modelFactory, AiTranslateTransform } from './model.js'
- import {
-   TextSplitterStream,
-   recursiveChunkTextSplitter,
-   getFormatByExtension
- } from './split.js'
-
- const log = logger('cli')
-
- const help: Record<string, string> = {}
-
- help._ = `
- AI Translator
-
- Usage:
-   ai-translate [flags]
-   ai-translate [command] [flags]
-
- Commands:
-   set                 set config value
-
- Flags:
-   -h, --help          Help for ai-translate
-   -v, --version       Show version information
-   -c, --config DIR    Use config file .ai-translate.json in DIR
-   -f, --from LANG     Source language
-   -t, --to LANG       Target language; LANG is English language name or
-                       supported BCP47 codes (ar, de, en, es, fr, ja, pt, ru,
-                       vi, zh-CN, zh-TW)
-   -i, --input FILE    input file
-   -o, --output FILE   output file
-   --format FORMAT     specify input format (cpp, go, java, js, php, proto,
-                       python, rst, ruby, rust, scala, swift, markdown, latex,
-                       html, sol)
- Examples:
-   Translate input.md from Spanish to output.md in English
-   ai-translate -f Spanish -t English -i input.md -o output.md
-
-   Pipe from stdin to stdout using the config in the local folder
-   echo "translate" | ai-translate -f en -t en -c .
-
- Use "ai-translate [command] --help" for more information about a command.
- `
-
- help.set = `
- Set ai-translate configuration
-
- Writes config to \`.ai-translate.json\`
- If --config flag is omitted then global config is used.
-
- Usage:
-   ai-translate [flags] set KEY VALUE
-
- Flags:
-   -c, --config DIR  Use config file .ai-translate.json in DIR
-
- Available KEYs:
-   provider      set provider (ollama, mistral, anthropic, openai, deepseek);
-                 default="ollama"
-   model         set model from provider; default="qwen2.5:7b"
-   apiKey        set api key
-   baseUrl       baseUrl for model
-   temperature   model temperature; default=0.1
-   maxRetries    max. number of retries; default=10
-   chunkSize     number of chunks used in text-splitter; default=1000
- `
-
- const PACKAGE_JSON = '../package.json'
- const APP = 'ai-translate'
- const CONF_FILE = `.${APP}.json`
- const DEFAULT_LANG = 'en'
-
- const commands: Record<string, (cmd: Record<string, unknown>, argv: string[]) => void> = {
-   set: (c, argv) => {
-     const key = nextArg(argv)
-     const value = nextArg(argv)
-     c.set = [key, value]
-   }
- }
-
- const flags: Record<string, (opts: Record<string, unknown>, argv: string[]) => void> = {
-   '--help': (f) => (f.help = true),
-   '--version': (f) => (f.version = true),
-   '--config': (f, argv) => {
-     const dir = nextArg(argv)
-     if (dir) f.config = path.resolve(process.cwd(), dir)
-   },
-   '--from': (f, argv) => {
-     const lang = nextArg(argv)
-     f.sourceLanguage = lang || DEFAULT_LANG
-   },
-   '--to': (f, argv) => {
-     const lang = nextArg(argv)
-     f.targetLanguage = lang || DEFAULT_LANG
-   },
-   '--input': (f, argv) => {
-     const filename = nextArg(argv)
-     if (filename) f.input = path.resolve(process.cwd(), filename)
-   },
-   '--output': (f, argv) => {
-     const filename = nextArg(argv)
-     if (filename) f.output = path.resolve(process.cwd(), filename)
-   },
-   '--format': (f, argv) => {
-     const format = nextArg(argv)
-     if (format) f.format = format
-   }
- }
- flags['-h'] = flags['--help']
- flags['-v'] = flags['--version']
- flags['-c'] = flags['--config']
- flags['-f'] = flags['--from']
- flags['-t'] = flags['--to']
- flags['-i'] = flags['--input']
- flags['-o'] = flags['--output']
-
- export const argvParse = (args?: string[]) => {
-   initProcLog()
-   const argv = args || process.argv.slice(2)
-   log.debug(argv)
-   const opts: Record<string, unknown> = {
-     sourceLanguage: 'en',
-     targetLanguage: 'en'
-   }
-   const cmd: Record<string, unknown> = {}
-
-   while (argv.length) {
-     const arg = argv.shift()
-     if (!arg) continue
-     if (commands[arg]) {
-       commands[arg](cmd, argv)
-     } else if (flags[arg]) {
-       flags[arg](opts, argv)
-     }
-   }
-   return { cmd, opts }
- }
-
- const nextArg = (argv: string[]) => {
-   const next = argv[0] || ''
-   if (next.indexOf('-') === 0) {
-     return
-   }
-   return argv.shift()
- }
-
- let _console = console
- export const _injectConsole = (obj: typeof console) => {
-   _console = obj
- }
-
- const displayError = (msg: string) => {
-   _console.error(`ERROR: ${msg}`)
- }
-
- const display = (msg: string) => _console.log(msg)
-
- const version = () => {
-   const packageJson = new URL(PACKAGE_JSON, import.meta.url)
-   const { version: pkgVersion } = JSON.parse(
-     fs.readFileSync(packageJson, 'utf-8')
-   ) as { version: string }
-   display(pkgVersion)
- }
-
- const schema = {
-   provider: StringSchema,
-   model: StringSchema,
-   apiKey: StringSchema,
-   baseUrl: StringSchema,
-   temperature: v.pipe(
-     v.string(),
-     v.transform(Number),
-     v.minValue(0),
-     v.maxValue(2)
-   ),
-   maxRetries: v.pipe(
-     v.string(),
-     v.transform(Number),
-     v.integer(),
-     v.minValue(0),
-     v.maxValue(99)
-   ),
-   chunkSize: v.pipe(
-     v.string(),
-     v.transform(Number),
-     v.integer(),
-     v.minValue(100),
-     v.maxValue(200e3)
-   )
- }
-
- const PROVIDER_API_KEY_ENV: Record<string, string> = {
-   openai: 'OPENAI_API_KEY',
-   anthropic: 'ANTHROPIC_API_KEY',
-   mistral: 'MISTRAL_API_KEY',
-   deepseek: 'DEEPSEEK_API_KEY'
- }
-
- const PROVIDER_BASE_URL_ENV: Record<string, string> = {
-   openai: 'OPENAI_BASE_URL',
-   anthropic: 'ANTHROPIC_BASE_URL',
-   mistral: 'MISTRAL_BASE_URL',
-   deepseek: 'DEEPSEEK_BASE_URL',
-   ollama: 'OLLAMA_BASE_URL'
- }
-
- const readEnv = (key?: string) => {
-   if (!key) return
-   const value = process.env[key]
-   if (!value) return
-   const trimmed = value.trim()
-   return trimmed ? trimmed : undefined
- }
-
- const pickFirst = (...values: Array<string | undefined>) =>
-   values.find((value) => value && value.length > 0)
-
- const resolveRuntimeConfig = (config: Record<string, unknown>) => {
-   const provider = String(config.provider || 'ollama')
-   const envApiKey = pickFirst(
-     readEnv('AI_TRANSLATE_API_KEY'),
-     readEnv(PROVIDER_API_KEY_ENV[provider])
-   )
-   const envBaseUrl = pickFirst(
-     readEnv('AI_TRANSLATE_BASE_URL'),
-     readEnv(PROVIDER_BASE_URL_ENV[provider])
-   )
-
-   return {
-     ...config,
-     ...(envApiKey ? { apiKey: envApiKey } : {}),
-     ...(envBaseUrl ? { baseUrl: envBaseUrl } : {})
-   }
- }
-
- export const cli = async (args?: string[]) => {
-   const { cmd, opts } = argvParse(args)
-   const command = Object.keys(cmd)[0]
-   log.debug({ cmd, opts })
-
-   const filename = !opts.config
-     ? CONF_FILE
-     : fs.lstatSync(String(opts.config)).isDirectory()
-       ? path.resolve(String(opts.config), CONF_FILE)
-       : String(opts.config)
-
-   const appConf = new AppConfig({ appName: APP, schema, filename })
-   await appConf.read().catch(() => null)
-
-   try {
-     if (opts.version) {
-       version()
-       return
-     }
-     if (opts.help) {
-       const helpText = help[command || ''] || help._
-       display(helpText)
-       return
-     }
-     if (cmd.set) {
-       const [key, value] = cmd.set as [string | undefined, string | undefined]
-       if (key) {
-         appConf.set(key, value)
-         await appConf.write()
-       } else {
-         display(JSON.stringify(appConf.config, null, 2))
-       }
-       return
-     }
-
-     const sourceLanguage = String(opts.sourceLanguage || DEFAULT_LANG)
-     const targetLanguage = String(opts.targetLanguage || DEFAULT_LANG)
-
-     const reader = opts.input ? fs.createReadStream(String(opts.input)) : process.stdin
-     const writer = opts.output ? fs.createWriteStream(String(opts.output)) : process.stdout
-
-     const runtimeConfig = resolveRuntimeConfig(
-       appConf.config as Record<string, unknown>
-     )
-     const model = modelFactory(runtimeConfig)
-     const lcNamespace =
-       (model as unknown as { lc_namespace?: Record<string, unknown> }).lc_namespace || {}
-     const lcKwargs = (model as unknown as { lc_kwargs?: Record<string, unknown> }).lc_kwargs || {}
-     const { apiKey: _apiKey, ...modelParams } = { ...lcNamespace, ...lcKwargs } as Record<string, unknown>
-     log.debug(modelParams)
-
-     const format =
-       (opts.format as SupportedTextSplitterLanguage | undefined) ||
-       getFormatByExtension(path.extname(String(opts.input || '_.md')))
-     const chunkSize = (runtimeConfig as { chunkSize?: number }).chunkSize || 1000
-     const textSplitter = recursiveChunkTextSplitter({ chunkSize, format })
-     const splitter = new TextSplitterStream({ textSplitter })
-     const translator = new AiTranslateTransform({
-       ...(opts as Record<string, unknown>),
-       sourceLanguage,
-       targetLanguage,
-       model
-     })
-
-     await pipeline(reader, splitter, translator, writer)
-   } catch (err) {
-     log.debug(err)
-     displayError((err as Error).message)
-   }
- }
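For orientation, a minimal hedged sketch of driving this deleted CLI entry point programmatically, based only on the source above (the 1.0.0 release may expose a different interface):

import { cli } from './cli.js'

// Equivalent to: ai-translate -f en -t de -c . -i input.md -o output.md
// Credentials can also come from AI_TRANSLATE_API_KEY / AI_TRANSLATE_BASE_URL or the
// provider-specific variables handled by resolveRuntimeConfig above.
await cli(['-f', 'en', '-t', 'de', '-c', '.', '-i', 'input.md', '-o', 'output.md'])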
package/src/index.ts DELETED
@@ -1,9 +0,0 @@
- export { AiTranslateTransform, modelFactory } from './model.js'
- export { promptInvoke, languages } from './prompt.js'
- export {
-   TextSplitterStream,
-   recursiveChunkTextSplitter,
-   getFormatByExtension
- } from './split.js'
- export type { Metadata, ModelFactoryOptions } from './model.js'
- export type { TextSplitterParams, Separator } from './split.js'
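A minimal sketch of the streaming pipeline these exports enabled, mirroring what src/cli.ts wires together above; the file names are placeholders and the package import assumes the 0.0.0 API:

import fs from 'node:fs'
import { pipeline } from 'node:stream/promises'
import {
  modelFactory,
  AiTranslateTransform,
  TextSplitterStream,
  recursiveChunkTextSplitter
} from '@thor123141245r/ai-translate'

// Ollama is the default provider (qwen2.5:7b, temperature 0.1)
const model = modelFactory({ provider: 'ollama' })
const splitter = new TextSplitterStream({
  textSplitter: recursiveChunkTextSplitter({ chunkSize: 1000, format: 'markdown' })
})
const translator = new AiTranslateTransform({
  model,
  format: 'markdown',
  sourceLanguage: 'en',
  targetLanguage: 'de'
})

// read -> split into chunks -> translate chunk by chunk -> write
await pipeline(
  fs.createReadStream('input.md'),
  splitter,
  translator,
  fs.createWriteStream('output.md')
)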
package/src/logger.ts DELETED
@@ -1,3 +0,0 @@
- import { ProcLog } from 'debug-level'
-
- export const logger = (namespace: string) => new ProcLog(`ai-translate:${namespace}`)
package/src/model.ts DELETED
@@ -1,139 +0,0 @@
- import { ChatOllama } from '@langchain/ollama'
- import { ChatMistralAI } from '@langchain/mistralai'
- import { ChatAnthropic } from '@langchain/anthropic'
- import { ChatOpenAI } from '@langchain/openai'
- import { ChatDeepSeek } from '@langchain/deepseek'
- import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
- import { AsyncTransform } from './asyncTransform.js'
- import { promptInvoke, replaceMarkerSymbol } from './prompt.js'
- import { logger } from './logger.js'
- import { isWhiteSpace, preserveWhiteSpace } from './utils.js'
-
- const log = logger('model')
-
- const DEFAULT = {
-   temperature: 0.1,
-   maxRetries: 10,
-   maxConcurrency: 1
- }
-
- export type Metadata = {
-   inputTokens: number
-   outputTokens: number
- }
-
- export type ModelFactoryOptions = {
-   provider?: 'ollama' | 'mistral' | 'anthropic' | 'openai' | 'deepseek'
-   [key: string]: unknown
- }
-
- export const modelFactory = (modelOpts?: ModelFactoryOptions): BaseChatModel => {
-   const { provider = 'ollama', ...other } = modelOpts || {}
-   const apiKeyValue = typeof other.apiKey === 'string' ? other.apiKey.trim() : ''
-   const baseUrlValue = typeof other.baseUrl === 'string' ? other.baseUrl.trim() : ''
-   const apiKey = apiKeyValue ? apiKeyValue : undefined
-   const baseUrl = baseUrlValue ? baseUrlValue : undefined
-   const filtered = { ...other }
-   if ('apiKey' in filtered) {
-     delete filtered.apiKey
-   }
-   if ('baseUrl' in filtered) {
-     delete filtered.baseUrl
-   }
-
-   switch (provider) {
-     case 'ollama':
-       return new ChatOllama({
-         ...DEFAULT,
-         model: 'qwen2.5:7b',
-         ...(baseUrl ? { baseUrl } : {}),
-         ...filtered
-       })
-     case 'mistral':
-       return new ChatMistralAI({
-         ...DEFAULT,
-         model: 'ministral-8b',
-         ...(apiKey ? { apiKey } : {}),
-         ...filtered
-       })
-     case 'anthropic':
-       return new ChatAnthropic({
-         ...DEFAULT,
-         model: 'claude-3-5-haiku-20241022',
-         ...(apiKey ? { apiKey } : {}),
-         ...filtered
-       })
-     case 'openai':
-       return new ChatOpenAI({
-         ...DEFAULT,
-         model: 'gpt-4o-mini',
-         ...(apiKey ? { apiKey } : {}),
-         ...(baseUrl ? { configuration: { baseURL: baseUrl } } : {}),
-         ...filtered
-       })
-     case 'deepseek':
-       return new ChatDeepSeek({
-         ...DEFAULT,
-         model: 'deepseek-reasoner',
-         ...(apiKey ? { apiKey } : {}),
-         ...(baseUrl ? { configuration: { baseURL: baseUrl } } : {}),
-         ...filtered
-       })
-     default:
-       throw new Error(`unsupported provider=${provider}`)
-   }
- }
-
- export class AiTranslateTransform extends AsyncTransform {
-   private readonly _model: BaseChatModel
-   private readonly _promptOpts: {
-     format?: string
-     sourceLanguage: string
-     targetLanguage: string
-   }
-   private _metadata: Metadata = { inputTokens: 0, outputTokens: 0 }
-
-   constructor(options: {
-     model: BaseChatModel
-     format?: string
-     sourceLanguage: string
-     targetLanguage: string
-     [key: string]: unknown
-   }) {
-     const { model, format, sourceLanguage, targetLanguage, ...rest } = options
-     super(rest)
-     this._model = model
-     this._promptOpts = { format, sourceLanguage, targetLanguage }
-   }
-
-   getMetadata() {
-     return { ...this._metadata }
-   }
-
-   protected async _asyncTransform(
-     chunk: Buffer | object,
-     encoding: BufferEncoding | 'buffer'
-   ) {
-     const text = AsyncTransform.toString(chunk, encoding)
-     if (isWhiteSpace(text)) {
-       return !this.push(text)
-     }
-     log.debug('inp=%j', text)
-     const messages = await promptInvoke({ ...this._promptOpts, text })
-     const result = await this._model.invoke(messages)
-     log.debug('out=%j', result.content)
-
-     const usage = (result as { usage_metadata?: { input_tokens?: number; output_tokens?: number } })
-       .usage_metadata
-     const inputTokens = usage?.input_tokens ?? 0
-     const outputTokens = usage?.output_tokens ?? 0
-
-     this.emit('metadata', { inputTokens, outputTokens })
-     this._metadata.inputTokens += inputTokens
-     this._metadata.outputTokens += outputTokens
-
-     return !this.push(
-       preserveWhiteSpace(text, replaceMarkerSymbol(result.content))
-     )
-   }
- }
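A hedged usage sketch for the factory and the token accounting above (not shipped code; values are illustrative):

import { modelFactory, AiTranslateTransform } from './model.js'

// No options: ChatOllama with qwen2.5:7b, temperature 0.1, maxRetries 10
const local = modelFactory()

// Blank or whitespace-only apiKey/baseUrl values are dropped before construction
const hosted = modelFactory({ provider: 'openai', apiKey: process.env.OPENAI_API_KEY })
// modelFactory({ provider: 'gemini' }) // -> Error: unsupported provider=gemini

const translator = new AiTranslateTransform({
  model: hosted ?? local,
  sourceLanguage: 'en',
  targetLanguage: 'fr'
})
// Per-chunk usage is emitted as 'metadata' events; getMetadata() returns running totals
translator.on('metadata', ({ inputTokens, outputTokens }) => {
  console.log({ inputTokens, outputTokens })
})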
package/src/prompt.ts DELETED
@@ -1,71 +0,0 @@
- import { ChatPromptTemplate } from '@langchain/core/prompts'
- import type { ChatPromptValueInterface } from '@langchain/core/prompt_values'
- import type { MessageContent } from '@langchain/core/messages'
- import { logger } from './logger.js'
-
- const log = logger('prompt')
-
- export const languages: Record<string, string> = {
-   ar: 'Arabic',
-   de: 'German',
-   en: 'English',
-   es: 'Spanish',
-   fr: 'French',
-   ja: 'Japanese',
-   pt: 'Portuguese',
-   ru: 'Russian',
-   vi: 'Vietnamese',
-   'zh-CN': 'Chinese-simplified',
-   'zh-TW': 'Chinese-traditional'
- }
-
- export const getLanguageName = (lang: string) => languages[lang] || lang
-
- const systemPrompt =
-   'You are an AI-driven advanced translation system, specifically designed to ' +
-   'translate structured and technical documents. ' +
-   'You will receive a text snippet from a file formatted as "{format}"\n\n' +
-   'Your task is to **accurately translate** the text enclosed between the 🔤 ' +
-   'symbols from "{sourceLanguage}" to "{targetLanguage}". ' +
-   'Preserve the original formatting, sentence structure, and terminology. ' +
-   'Ensure that every word and sentence is translated as closely as possible to ' +
-   'the original meaning, without summarizing or omitting any part of the content. ' +
-   'The translation must be faithful, detailed, and maintain the original length ' +
-   'and complexity. ' +
-   'Deliver the translation exactly as required, without any additional ' +
-   'commentary or explanation, and ensure the 🔤 symbols are removed in the final output.\n\n' +
-   'Remember: your job is to **translate** the text exactly as it is, without ' +
-   'adding summaries or changing the content in any way. ' +
-   'Do not skip or modify any part of the text. Ensure that the output is a ' +
-   'direct translation, and that the original structure and meaning are preserved.'
-
- const promptTemplate = ChatPromptTemplate.fromMessages([
-   ['system', systemPrompt],
-   ['user', '🔤{text}🔤']
- ])
-
- export const promptInvoke = ({
-   format = 'markdown',
-   sourceLanguage,
-   targetLanguage,
-   text = ''
- }: {
-   format?: string
-   sourceLanguage: string
-   targetLanguage: string
-   text: string
- }): Promise<ChatPromptValueInterface> =>
-   promptTemplate.invoke({
-     format,
-     sourceLanguage: getLanguageName(sourceLanguage),
-     targetLanguage: getLanguageName(targetLanguage),
-     text
-   })
-
- export const replaceMarkerSymbol = (text: MessageContent) => {
-   if (typeof text === 'string') {
-     return text.replace(/^\s*🔤/, '').replace(/🔤\s*$/, '')
-   }
-   log.error(text)
-   return ''
- }
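As an illustration of the prompt contract above (a sketch, not part of the package): the text is wrapped in 🔤 markers, language codes are expanded to English names via the languages map, and the model reply is cleaned with replaceMarkerSymbol.

import { promptInvoke, replaceMarkerSymbol } from './prompt.js'

// 'en' and 'ja' are expanded to 'English' / 'Japanese' before templating
const messages = await promptInvoke({
  format: 'markdown',
  sourceLanguage: 'en',
  targetLanguage: 'ja',
  text: '# Hello world'
})
// messages is then passed to model.invoke(messages); the 🔤 markers around the
// reply are stripped afterwards:
// replaceMarkerSymbol('🔤# こんにちは世界🔤') === '# こんにちは世界'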
package/src/split.ts DELETED
@@ -1,111 +0,0 @@
- import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'
- import type {
-   RecursiveCharacterTextSplitterParams,
-   SupportedTextSplitterLanguage
- } from '@langchain/textsplitters'
- import { AsyncTransform } from './asyncTransform.js'
-
- export type TextSplitterFormat = { format?: SupportedTextSplitterLanguage }
- export type TextSplitterParams = Partial<RecursiveCharacterTextSplitterParams> &
-   TextSplitterFormat
- export type Separator = string | RegExp
- export type TextSplitterLike = { splitText(text: string): Promise<string[]> }
-
- const extensionsByFormat: Record<string, string[]> = {
-   cpp: ['.h', '.c', '.cpp'],
-   go: ['.go'],
-   java: ['.java'],
-   js: ['.js', '.cjs', '.mjs', '.jsx', '.ts', '.tsx'],
-   php: ['.php', '.inc'],
-   proto: ['.proto'],
-   python: ['.py'],
-   rst: ['.rst'],
-   ruby: ['.rb', '.gem'],
-   rust: ['.rs'],
-   scala: ['.scala', '.sc'],
-   swift: ['.swift'],
-   markdown: ['.md', '.markdown', '.mkdn'],
-   latex: ['.tex', '.latex', '.ltx'],
-   html: ['.html', '.htm', '.xhtml', '.shtml', '.asp', '.aspx', '.jsp']
- }
-
- let formatByExtension: Record<string, SupportedTextSplitterLanguage> | undefined
-
- export const getFormatByExtension = (
-   extname: string
- ): SupportedTextSplitterLanguage | undefined => {
-   if (!formatByExtension) {
-     formatByExtension = {}
-     for (const [key, values] of Object.entries(extensionsByFormat)) {
-       for (const ext of values) {
-         formatByExtension[ext] = key as SupportedTextSplitterLanguage
-       }
-     }
-   }
-   return formatByExtension[extname]
- }
-
- export const recursiveChunkTextSplitter = (options?: TextSplitterParams) => {
-   const { format = 'markdown', ...rest } = options || {}
-   const separators = extensionsByFormat[format]
-     ? RecursiveCharacterTextSplitter.getSeparatorsForLanguage(format)
-     : ['\n\n', '\n', '.', '']
-   return new RecursiveChunkTextSplitter({
-     chunkSize: 1000,
-     chunkOverlap: 0,
-     separators,
-     ...rest
-   })
- }
-
- // @ts-expect-error override joinDocs to preserve whitespace; upstream marks it private
- export class RecursiveChunkTextSplitter extends RecursiveCharacterTextSplitter {
-   protected override joinDocs(docs: string[], separator: string): string | null {
-     const text = docs.join(separator)
-     return text === '' ? null : text
-   }
- }
-
- export class TextSplitterStream extends AsyncTransform {
-   private readonly _textSplitter: TextSplitterLike
-   private _documents: string[]
-   private _buffer: string
-
-   constructor(options?: { textSplitter?: TextSplitterLike }) {
-     const { textSplitter } = options || {}
-     super()
-     this._textSplitter = textSplitter || recursiveChunkTextSplitter()
-     this._documents = []
-     this._buffer = ''
-   }
-
-   protected async _asyncTransform(
-     chunk: Buffer | object,
-     encoding: BufferEncoding | 'buffer'
-   ) {
-     const text = AsyncTransform.toString(chunk, encoding)
-     this._buffer += text
-
-     const documents = await this._textSplitter.splitText(this._buffer)
-     this._buffer = documents.pop() ?? ''
-     this._documents = this._documents.concat(documents)
-
-     while (this._documents.length) {
-       const document = this._documents.shift()
-       if (document === undefined) break
-       const wait = !this.push(document)
-       if (wait) {
-         return true
-       }
-     }
-     return false
-   }
-
-   _flush(done: (error?: Error | null) => void) {
-     for (const document of this._documents) {
-       this.push(document)
-     }
-     this.push(this._buffer)
-     done()
-   }
- }
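A small standalone sketch of the splitter pieces, assuming the deleted module were still importable; the sample text is made up:

import { getFormatByExtension, recursiveChunkTextSplitter } from './split.js'

// '.md' maps to 'markdown'; unknown extensions return undefined
const format = getFormatByExtension('.md')

// 1000-character chunks, no overlap, markdown-aware separators (the defaults above)
const splitter = recursiveChunkTextSplitter({ format, chunkSize: 1000 })
const chunks = await splitter.splitText('# Title\n\nFirst paragraph.\n\nSecond paragraph.')
console.log(chunks.length)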
package/src/utils.ts DELETED
@@ -1,15 +0,0 @@
- export const isWhiteSpace = (text = '') => text.trim() === ''
-
- export const whiteSpace = (text = '') => {
-   const head = /^\s{0,200}/.exec(text)?.[0] || ''
-   let tail = ''
-   if (head.length !== text.length) {
-     tail = /\s{0,200}$/.exec(text)?.[0] || ''
-   }
-   return { head, tail }
- }
-
- export const preserveWhiteSpace = (inp: string, out: string) => {
-   const { head, tail } = whiteSpace(inp)
-   return head + out.trim() + tail
- }
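For clarity, the whitespace helpers behave as in this small sketch (illustrative values only):

import { isWhiteSpace, preserveWhiteSpace } from './utils.js'

console.log(isWhiteSpace('  \n'))                        // true
// Leading/trailing whitespace of the input is re-applied around the trimmed output
console.log(preserveWhiteSpace('\n  Hello \n', 'Bonjour')) // '\n  Bonjour \n'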
package/tsconfig.json DELETED
@@ -1,19 +0,0 @@
- {
-   "compilerOptions": {
-     "target": "ES2022",
-     "module": "NodeNext",
-     "moduleResolution": "NodeNext",
-     "rootDir": "src",
-     "outDir": "dist",
-     "declaration": true,
-     "declarationMap": true,
-     "sourceMap": true,
-     "esModuleInterop": true,
-     "resolveJsonModule": false,
-     "strict": true,
-     "skipLibCheck": true,
-     "forceConsistentCasingInFileNames": true
-   },
-   "include": ["src/**/*.ts"],
-   "exclude": ["node_modules", "dist"]
- }