@jterrazz/intelligence 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs
CHANGED
|
@@ -697,7 +697,7 @@ var ASCII_CTRL_RE = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]/g;
|
|
|
697
697
|
var MULTIPLE_SPACES_RE = / {2,}/g;
|
|
698
698
|
var CR_RE = /\r\n?/g;
|
|
699
699
|
var CITATION_RE = / *\(oaicite:\d+\)\{index=\d+\}/g;
|
|
700
|
-
var EM_DASH_SEPARATOR_RE =
|
|
700
|
+
var EM_DASH_SEPARATOR_RE = /\s*[—–―‒]\s*/g;
|
|
701
701
|
var TYPOGRAPHY_REPLACEMENTS = [
|
|
702
702
|
{
|
|
703
703
|
pattern: /[\u2018\u2019\u201A]/g,
|
|
@@ -707,10 +707,6 @@ var TYPOGRAPHY_REPLACEMENTS = [
|
|
|
707
707
|
pattern: /[\u201C\u201D\u201E]/g,
|
|
708
708
|
replacement: '"'
|
|
709
709
|
},
|
|
710
|
-
{
|
|
711
|
-
pattern: /[\u2013\u2014]/g,
|
|
712
|
-
replacement: '-'
|
|
713
|
-
},
|
|
714
710
|
{
|
|
715
711
|
pattern: /\u2026/g,
|
|
716
712
|
replacement: '...'
|
|
@@ -80,10 +80,18 @@ describe('parseText', function() {
|
|
|
80
80
|
it('converts figure dash with spaces to comma', function() {
|
|
81
81
|
expect(parseText('hello ‒ world')).toBe('hello, world');
|
|
82
82
|
});
|
|
83
|
+
it('converts em dash without spaces to comma', function() {
|
|
84
|
+
expect(parseText('disparaître—ne laissant')).toBe('disparaître, ne laissant');
|
|
85
|
+
});
|
|
83
86
|
it('can be disabled via options', function() {
|
|
84
87
|
expect(parseText('hello — world', {
|
|
85
88
|
normalizeEmDashesToCommas: false
|
|
86
|
-
})).toBe('hello
|
|
89
|
+
})).toBe('hello — world');
|
|
90
|
+
});
|
|
91
|
+
it('preserves em dash when disabled (no spaces)', function() {
|
|
92
|
+
expect(parseText('word—word', {
|
|
93
|
+
normalizeEmDashesToCommas: false
|
|
94
|
+
})).toBe('word—word');
|
|
87
95
|
});
|
|
88
96
|
});
|
|
89
97
|
describe('space-like character normalization', function() {
|
|
@@ -110,11 +118,11 @@ describe('parseText', function() {
|
|
|
110
118
|
it('converts left double quote to straight quote', function() {
|
|
111
119
|
expect(parseText('\u201CHello\u201D')).toBe('"Hello"');
|
|
112
120
|
});
|
|
113
|
-
it('converts em dash to
|
|
114
|
-
expect(parseText('word\u2014word')).toBe('word
|
|
121
|
+
it('converts em dash to comma', function() {
|
|
122
|
+
expect(parseText('word\u2014word')).toBe('word, word');
|
|
115
123
|
});
|
|
116
|
-
it('converts en dash to
|
|
117
|
-
expect(parseText('2020\u20132021')).toBe('2020
|
|
124
|
+
it('converts en dash to comma', function() {
|
|
125
|
+
expect(parseText('2020\u20132021')).toBe('2020, 2021');
|
|
118
126
|
});
|
|
119
127
|
it('converts ellipsis to three dots', function() {
|
|
120
128
|
expect(parseText('wait\u2026')).toBe('wait...');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../../src/parsing/__tests__/parse-text.test.ts"],"sourcesContent":["import { describe, expect, it } from 'vitest';\n\nimport { parseText } from '../parse-text.js';\n\ndescribe('parseText', () => {\n describe('empty and basic input', () => {\n it('returns empty string for empty input', () => {\n expect(parseText('')).toBe('');\n });\n\n it('returns trimmed text for simple input', () => {\n expect(parseText(' hello world ')).toBe('hello world');\n });\n\n it('preserves newlines', () => {\n expect(parseText('hello\\nworld')).toBe('hello\\nworld');\n });\n });\n\n describe('BOM handling', () => {\n it('removes BOM character at start', () => {\n expect(parseText('\\uFEFFhello')).toBe('hello');\n });\n\n it('removes BOM in middle of text (via invisible char removal)', () => {\n expect(parseText('hello\\uFEFFworld')).toBe('helloworld');\n });\n });\n\n describe('line ending normalization', () => {\n it('converts CRLF to LF', () => {\n expect(parseText('hello\\r\\nworld')).toBe('hello\\nworld');\n });\n\n it('converts standalone CR to LF', () => {\n expect(parseText('hello\\rworld')).toBe('hello\\nworld');\n });\n });\n\n describe('AI citation removal', () => {\n it('removes oaicite markers', () => {\n expect(parseText('Some text (oaicite:0){index=0} more text')).toBe(\n 'Some text more text',\n );\n });\n\n it('removes multiple citation markers', () => {\n expect(parseText('Text (oaicite:1){index=1} and (oaicite:2){index=2} here')).toBe(\n 'Text and here',\n );\n });\n });\n\n describe('invisible character removal', () => {\n it('removes zero-width space', () => {\n expect(parseText('hello\\u200Bworld')).toBe('helloworld');\n });\n\n it('removes zero-width non-joiner', () => {\n expect(parseText('hello\\u200Cworld')).toBe('helloworld');\n });\n\n it('removes soft hyphen', () => {\n expect(parseText('hello\\u00ADworld')).toBe('helloworld');\n });\n\n it('removes direction marks', () => {\n expect(parseText('hello\\u200E\\u200Fworld')).toBe('helloworld');\n });\n\n it('removes word joiner', () => {\n expect(parseText('hello\\u2060world')).toBe('helloworld');\n });\n });\n\n describe('ASCII control character removal', () => {\n it('removes null character', () => {\n expect(parseText('hello\\x00world')).toBe('helloworld');\n });\n\n it('removes bell character', () => {\n expect(parseText('hello\\x07world')).toBe('helloworld');\n });\n\n it('removes delete character', () => {\n expect(parseText('hello\\x7Fworld')).toBe('helloworld');\n });\n\n it('preserves tab and newline', () => {\n expect(parseText('hello\\tworld\\n!')).toBe('hello\\tworld\\n!');\n });\n });\n\n describe('em/en dash normalization', () => {\n it('converts em dash with spaces to comma', () => {\n expect(parseText('hello — world')).toBe('hello, world');\n });\n\n it('converts en dash with spaces to comma', () => {\n expect(parseText('hello – world')).toBe('hello, world');\n });\n\n it('converts horizontal bar with spaces to comma', () => {\n expect(parseText('hello ― world')).toBe('hello, world');\n });\n\n it('converts figure dash with spaces to comma', () => {\n expect(parseText('hello ‒ world')).toBe('hello, world');\n });\n\n it('can be disabled via options', () => {\n expect(parseText('hello — world', { normalizeEmDashesToCommas: false })).toBe(\n 'hello - world',\n );\n });\n });\n\n describe('space-like character normalization', () => {\n it('converts non-breaking space to regular space', () => {\n expect(parseText('hello\\u00A0world')).toBe('hello world');\n });\n\n it('converts em space to regular space', () => {\n expect(parseText('hello\\u2003world')).toBe('hello world');\n });\n\n it('converts narrow no-break space to regular space', () => {\n expect(parseText('hello\\u202Fworld')).toBe('hello world');\n });\n\n it('converts ideographic space to regular space', () => {\n expect(parseText('hello\\u3000world')).toBe('hello world');\n });\n });\n\n describe('typography normalization', () => {\n it('converts left single quote to straight quote', () => {\n expect(parseText('it\\u2018s')).toBe(\"it's\");\n });\n\n it('converts right single quote to straight quote', () => {\n expect(parseText('it\\u2019s')).toBe(\"it's\");\n });\n\n it('converts left double quote to straight quote', () => {\n expect(parseText('\\u201CHello\\u201D')).toBe('\"Hello\"');\n });\n\n it('converts em dash to hyphen', () => {\n expect(parseText('word\\u2014word')).toBe('word-word');\n });\n\n it('converts en dash to hyphen', () => {\n expect(parseText('2020\\u20132021')).toBe('2020-2021');\n });\n\n it('converts ellipsis to three dots', () => {\n expect(parseText('wait\\u2026')).toBe('wait...');\n });\n\n it('converts bullet point to hyphen', () => {\n expect(parseText('\\u2022 item')).toBe('- item');\n });\n });\n\n describe('multiple space collapsing', () => {\n it('collapses multiple spaces to single space', () => {\n expect(parseText('hello world')).toBe('hello world');\n });\n\n it('trims leading and trailing spaces', () => {\n expect(parseText(' hello world ')).toBe('hello world');\n });\n\n it('can be disabled via options', () => {\n expect(parseText('hello world', { collapseSpaces: false })).toBe('hello world');\n });\n });\n\n describe('NFKC normalization', () => {\n it('normalizes fullwidth characters', () => {\n expect(parseText('\\uFF21\\uFF22\\uFF23')).toBe('ABC');\n });\n\n it('normalizes ligatures', () => {\n expect(parseText('\\uFB01le')).toBe('file');\n });\n });\n\n describe('combined scenarios', () => {\n it('handles AI-generated text with multiple issues', () => {\n const input =\n '\\uFEFF Hello\\u2019s world (oaicite:0){index=0} \\u2014 with\\u00A0spaces ';\n expect(parseText(input)).toBe(\"Hello's world, with spaces\");\n });\n\n it('handles markdown with smart quotes and dashes', () => {\n const input = '\\u201CThis is a quote\\u201D \\u2014 Author';\n expect(parseText(input)).toBe('\"This is a quote\", Author');\n });\n });\n});\n"],"names":["describe","expect","it","parseText","toBe","normalizeEmDashesToCommas","collapseSpaces","input"],"mappings":"AAAA,SAASA,QAAQ,EAAEC,MAAM,EAAEC,EAAE,QAAQ,SAAS;AAE9C,SAASC,SAAS,QAAQ,mBAAmB;AAE7CH,SAAS,aAAa;IAClBA,SAAS,yBAAyB;QAC9BE,GAAG,wCAAwC;YACvCD,OAAOE,UAAU,KAAKC,IAAI,CAAC;QAC/B;QAEAF,GAAG,yCAAyC;YACxCD,OAAOE,UAAU,oBAAoBC,IAAI,CAAC;QAC9C;QAEAF,GAAG,sBAAsB;YACrBD,OAAOE,UAAU,iBAAiBC,IAAI,CAAC;QAC3C;IACJ;IAEAJ,SAAS,gBAAgB;QACrBE,GAAG,kCAAkC;YACjCD,OAAOE,UAAU,gBAAgBC,IAAI,CAAC;QAC1C;QAEAF,GAAG,8DAA8D;YAC7DD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;IACJ;IAEAJ,SAAS,6BAA6B;QAClCE,GAAG,uBAAuB;YACtBD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,gCAAgC;YAC/BD,OAAOE,UAAU,iBAAiBC,IAAI,CAAC;QAC3C;IACJ;IAEAJ,SAAS,uBAAuB;QAC5BE,GAAG,2BAA2B;YAC1BD,OAAOE,UAAU,6CAA6CC,IAAI,CAC9D;QAER;QAEAF,GAAG,qCAAqC;YACpCD,OAAOE,UAAU,4DAA4DC,IAAI,CAC7E;QAER;IACJ;IAEAJ,SAAS,+BAA+B;QACpCE,GAAG,4BAA4B;YAC3BD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,iCAAiC;YAChCD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,uBAAuB;YACtBD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,2BAA2B;YAC1BD,OAAOE,UAAU,2BAA2BC,IAAI,CAAC;QACrD;QAEAF,GAAG,uBAAuB;YACtBD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;IACJ;IAEAJ,SAAS,mCAAmC;QACxCE,GAAG,0BAA0B;YACzBD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,0BAA0B;YACzBD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,4BAA4B;YAC3BD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,6BAA6B;YAC5BD,OAAOE,UAAU,oBAAoBC,IAAI,CAAC;QAC9C;IACJ;IAEAJ,SAAS,4BAA4B;QACjCE,GAAG,yCAAyC;YACxCD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,yCAAyC;YACxCD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,6CAA6C;YAC5CD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,+BAA+B;YAC9BD,OAAOE,UAAU,iBAAiB;gBAAEE,2BAA2B;YAAM,IAAID,IAAI,CACzE;QAER;IACJ;IAEAJ,SAAS,sCAAsC;QAC3CE,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,sCAAsC;YACrCD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,mDAAmD;YAClDD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,+CAA+C;YAC9CD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;IACJ;IAEAJ,SAAS,4BAA4B;QACjCE,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,cAAcC,IAAI,CAAC;QACxC;QAEAF,GAAG,iDAAiD;YAChDD,OAAOE,UAAU,cAAcC,IAAI,CAAC;QACxC;QAEAF,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,sBAAsBC,IAAI,CAAC;QAChD;QAEAF,GAAG,8BAA8B;YAC7BD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,8BAA8B;YAC7BD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,mCAAmC;YAClCD,OAAOE,UAAU,eAAeC,IAAI,CAAC;QACzC;QAEAF,GAAG,mCAAmC;YAClCD,OAAOE,UAAU,gBAAgBC,IAAI,CAAC;QAC1C;IACJ;IAEAJ,SAAS,6BAA6B;QAClCE,GAAG,6CAA6C;YAC5CD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,qCAAqC;YACpCD,OAAOE,UAAU,sBAAsBC,IAAI,CAAC;QAChD;QAEAF,GAAG,+BAA+B;YAC9BD,OAAOE,UAAU,kBAAkB;gBAAEG,gBAAgB;YAAM,IAAIF,IAAI,CAAC;QACxE;IACJ;IAEAJ,SAAS,sBAAsB;QAC3BE,GAAG,mCAAmC;YAClCD,OAAOE,UAAU,uBAAuBC,IAAI,CAAC;QACjD;QAEAF,GAAG,wBAAwB;YACvBD,OAAOE,UAAU,aAAaC,IAAI,CAAC;QACvC;IACJ;IAEAJ,SAAS,sBAAsB;QAC3BE,GAAG,kDAAkD;YACjD,IAAMK,QACF;YACJN,OAAOE,UAAUI,QAAQH,IAAI,CAAC;QAClC;QAEAF,GAAG,iDAAiD;YAChD,IAAMK,QAAQ;YACdN,OAAOE,UAAUI,QAAQH,IAAI,CAAC;QAClC;IACJ;AACJ"}
|
|
1
|
+
{"version":3,"sources":["../../../src/parsing/__tests__/parse-text.test.ts"],"sourcesContent":["import { describe, expect, it } from 'vitest';\n\nimport { parseText } from '../parse-text.js';\n\ndescribe('parseText', () => {\n describe('empty and basic input', () => {\n it('returns empty string for empty input', () => {\n expect(parseText('')).toBe('');\n });\n\n it('returns trimmed text for simple input', () => {\n expect(parseText(' hello world ')).toBe('hello world');\n });\n\n it('preserves newlines', () => {\n expect(parseText('hello\\nworld')).toBe('hello\\nworld');\n });\n });\n\n describe('BOM handling', () => {\n it('removes BOM character at start', () => {\n expect(parseText('\\uFEFFhello')).toBe('hello');\n });\n\n it('removes BOM in middle of text (via invisible char removal)', () => {\n expect(parseText('hello\\uFEFFworld')).toBe('helloworld');\n });\n });\n\n describe('line ending normalization', () => {\n it('converts CRLF to LF', () => {\n expect(parseText('hello\\r\\nworld')).toBe('hello\\nworld');\n });\n\n it('converts standalone CR to LF', () => {\n expect(parseText('hello\\rworld')).toBe('hello\\nworld');\n });\n });\n\n describe('AI citation removal', () => {\n it('removes oaicite markers', () => {\n expect(parseText('Some text (oaicite:0){index=0} more text')).toBe(\n 'Some text more text',\n );\n });\n\n it('removes multiple citation markers', () => {\n expect(parseText('Text (oaicite:1){index=1} and (oaicite:2){index=2} here')).toBe(\n 'Text and here',\n );\n });\n });\n\n describe('invisible character removal', () => {\n it('removes zero-width space', () => {\n expect(parseText('hello\\u200Bworld')).toBe('helloworld');\n });\n\n it('removes zero-width non-joiner', () => {\n expect(parseText('hello\\u200Cworld')).toBe('helloworld');\n });\n\n it('removes soft hyphen', () => {\n expect(parseText('hello\\u00ADworld')).toBe('helloworld');\n });\n\n it('removes direction marks', () => {\n expect(parseText('hello\\u200E\\u200Fworld')).toBe('helloworld');\n });\n\n it('removes word joiner', () => {\n expect(parseText('hello\\u2060world')).toBe('helloworld');\n });\n });\n\n describe('ASCII control character removal', () => {\n it('removes null character', () => {\n expect(parseText('hello\\x00world')).toBe('helloworld');\n });\n\n it('removes bell character', () => {\n expect(parseText('hello\\x07world')).toBe('helloworld');\n });\n\n it('removes delete character', () => {\n expect(parseText('hello\\x7Fworld')).toBe('helloworld');\n });\n\n it('preserves tab and newline', () => {\n expect(parseText('hello\\tworld\\n!')).toBe('hello\\tworld\\n!');\n });\n });\n\n describe('em/en dash normalization', () => {\n it('converts em dash with spaces to comma', () => {\n expect(parseText('hello — world')).toBe('hello, world');\n });\n\n it('converts en dash with spaces to comma', () => {\n expect(parseText('hello – world')).toBe('hello, world');\n });\n\n it('converts horizontal bar with spaces to comma', () => {\n expect(parseText('hello ― world')).toBe('hello, world');\n });\n\n it('converts figure dash with spaces to comma', () => {\n expect(parseText('hello ‒ world')).toBe('hello, world');\n });\n\n it('converts em dash without spaces to comma', () => {\n expect(parseText('disparaître—ne laissant')).toBe('disparaître, ne laissant');\n });\n\n it('can be disabled via options', () => {\n expect(parseText('hello — world', { normalizeEmDashesToCommas: false })).toBe(\n 'hello — world',\n );\n });\n\n it('preserves em dash when disabled (no spaces)', () => {\n expect(parseText('word—word', { normalizeEmDashesToCommas: false })).toBe('word—word');\n });\n });\n\n describe('space-like character normalization', () => {\n it('converts non-breaking space to regular space', () => {\n expect(parseText('hello\\u00A0world')).toBe('hello world');\n });\n\n it('converts em space to regular space', () => {\n expect(parseText('hello\\u2003world')).toBe('hello world');\n });\n\n it('converts narrow no-break space to regular space', () => {\n expect(parseText('hello\\u202Fworld')).toBe('hello world');\n });\n\n it('converts ideographic space to regular space', () => {\n expect(parseText('hello\\u3000world')).toBe('hello world');\n });\n });\n\n describe('typography normalization', () => {\n it('converts left single quote to straight quote', () => {\n expect(parseText('it\\u2018s')).toBe(\"it's\");\n });\n\n it('converts right single quote to straight quote', () => {\n expect(parseText('it\\u2019s')).toBe(\"it's\");\n });\n\n it('converts left double quote to straight quote', () => {\n expect(parseText('\\u201CHello\\u201D')).toBe('\"Hello\"');\n });\n\n it('converts em dash to comma', () => {\n expect(parseText('word\\u2014word')).toBe('word, word');\n });\n\n it('converts en dash to comma', () => {\n expect(parseText('2020\\u20132021')).toBe('2020, 2021');\n });\n\n it('converts ellipsis to three dots', () => {\n expect(parseText('wait\\u2026')).toBe('wait...');\n });\n\n it('converts bullet point to hyphen', () => {\n expect(parseText('\\u2022 item')).toBe('- item');\n });\n });\n\n describe('multiple space collapsing', () => {\n it('collapses multiple spaces to single space', () => {\n expect(parseText('hello world')).toBe('hello world');\n });\n\n it('trims leading and trailing spaces', () => {\n expect(parseText(' hello world ')).toBe('hello world');\n });\n\n it('can be disabled via options', () => {\n expect(parseText('hello world', { collapseSpaces: false })).toBe('hello world');\n });\n });\n\n describe('NFKC normalization', () => {\n it('normalizes fullwidth characters', () => {\n expect(parseText('\\uFF21\\uFF22\\uFF23')).toBe('ABC');\n });\n\n it('normalizes ligatures', () => {\n expect(parseText('\\uFB01le')).toBe('file');\n });\n });\n\n describe('combined scenarios', () => {\n it('handles AI-generated text with multiple issues', () => {\n const input =\n '\\uFEFF Hello\\u2019s world (oaicite:0){index=0} \\u2014 with\\u00A0spaces ';\n expect(parseText(input)).toBe(\"Hello's world, with spaces\");\n });\n\n it('handles markdown with smart quotes and dashes', () => {\n const input = '\\u201CThis is a quote\\u201D \\u2014 Author';\n expect(parseText(input)).toBe('\"This is a quote\", Author');\n });\n });\n});\n"],"names":["describe","expect","it","parseText","toBe","normalizeEmDashesToCommas","collapseSpaces","input"],"mappings":"AAAA,SAASA,QAAQ,EAAEC,MAAM,EAAEC,EAAE,QAAQ,SAAS;AAE9C,SAASC,SAAS,QAAQ,mBAAmB;AAE7CH,SAAS,aAAa;IAClBA,SAAS,yBAAyB;QAC9BE,GAAG,wCAAwC;YACvCD,OAAOE,UAAU,KAAKC,IAAI,CAAC;QAC/B;QAEAF,GAAG,yCAAyC;YACxCD,OAAOE,UAAU,oBAAoBC,IAAI,CAAC;QAC9C;QAEAF,GAAG,sBAAsB;YACrBD,OAAOE,UAAU,iBAAiBC,IAAI,CAAC;QAC3C;IACJ;IAEAJ,SAAS,gBAAgB;QACrBE,GAAG,kCAAkC;YACjCD,OAAOE,UAAU,gBAAgBC,IAAI,CAAC;QAC1C;QAEAF,GAAG,8DAA8D;YAC7DD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;IACJ;IAEAJ,SAAS,6BAA6B;QAClCE,GAAG,uBAAuB;YACtBD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,gCAAgC;YAC/BD,OAAOE,UAAU,iBAAiBC,IAAI,CAAC;QAC3C;IACJ;IAEAJ,SAAS,uBAAuB;QAC5BE,GAAG,2BAA2B;YAC1BD,OAAOE,UAAU,6CAA6CC,IAAI,CAC9D;QAER;QAEAF,GAAG,qCAAqC;YACpCD,OAAOE,UAAU,4DAA4DC,IAAI,CAC7E;QAER;IACJ;IAEAJ,SAAS,+BAA+B;QACpCE,GAAG,4BAA4B;YAC3BD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,iCAAiC;YAChCD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,uBAAuB;YACtBD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,2BAA2B;YAC1BD,OAAOE,UAAU,2BAA2BC,IAAI,CAAC;QACrD;QAEAF,GAAG,uBAAuB;YACtBD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;IACJ;IAEAJ,SAAS,mCAAmC;QACxCE,GAAG,0BAA0B;YACzBD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,0BAA0B;YACzBD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,4BAA4B;YAC3BD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,6BAA6B;YAC5BD,OAAOE,UAAU,oBAAoBC,IAAI,CAAC;QAC9C;IACJ;IAEAJ,SAAS,4BAA4B;QACjCE,GAAG,yCAAyC;YACxCD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,yCAAyC;YACxCD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,6CAA6C;YAC5CD,OAAOE,UAAU,kBAAkBC,IAAI,CAAC;QAC5C;QAEAF,GAAG,4CAA4C;YAC3CD,OAAOE,UAAU,4BAA4BC,IAAI,CAAC;QACtD;QAEAF,GAAG,+BAA+B;YAC9BD,OAAOE,UAAU,iBAAiB;gBAAEE,2BAA2B;YAAM,IAAID,IAAI,CACzE;QAER;QAEAF,GAAG,+CAA+C;YAC9CD,OAAOE,UAAU,aAAa;gBAAEE,2BAA2B;YAAM,IAAID,IAAI,CAAC;QAC9E;IACJ;IAEAJ,SAAS,sCAAsC;QAC3CE,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,sCAAsC;YACrCD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,mDAAmD;YAClDD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;QAEAF,GAAG,+CAA+C;YAC9CD,OAAOE,UAAU,qBAAqBC,IAAI,CAAC;QAC/C;IACJ;IAEAJ,SAAS,4BAA4B;QACjCE,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,cAAcC,IAAI,CAAC;QACxC;QAEAF,GAAG,iDAAiD;YAChDD,OAAOE,UAAU,cAAcC,IAAI,CAAC;QACxC;QAEAF,GAAG,gDAAgD;YAC/CD,OAAOE,UAAU,sBAAsBC,IAAI,CAAC;QAChD;QAEAF,GAAG,6BAA6B;YAC5BD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,6BAA6B;YAC5BD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,mCAAmC;YAClCD,OAAOE,UAAU,eAAeC,IAAI,CAAC;QACzC;QAEAF,GAAG,mCAAmC;YAClCD,OAAOE,UAAU,gBAAgBC,IAAI,CAAC;QAC1C;IACJ;IAEAJ,SAAS,6BAA6B;QAClCE,GAAG,6CAA6C;YAC5CD,OAAOE,UAAU,mBAAmBC,IAAI,CAAC;QAC7C;QAEAF,GAAG,qCAAqC;YACpCD,OAAOE,UAAU,sBAAsBC,IAAI,CAAC;QAChD;QAEAF,GAAG,+BAA+B;YAC9BD,OAAOE,UAAU,kBAAkB;gBAAEG,gBAAgB;YAAM,IAAIF,IAAI,CAAC;QACxE;IACJ;IAEAJ,SAAS,sBAAsB;QAC3BE,GAAG,mCAAmC;YAClCD,OAAOE,UAAU,uBAAuBC,IAAI,CAAC;QACjD;QAEAF,GAAG,wBAAwB;YACvBD,OAAOE,UAAU,aAAaC,IAAI,CAAC;QACvC;IACJ;IAEAJ,SAAS,sBAAsB;QAC3BE,GAAG,kDAAkD;YACjD,IAAMK,QACF;YACJN,OAAOE,UAAUI,QAAQH,IAAI,CAAC;QAClC;QAEAF,GAAG,iDAAiD;YAChD,IAAMK,QAAQ;YACdN,OAAOE,UAAUI,QAAQH,IAAI,CAAC;QAClC;IACJ;AACJ"}
|
|
@@ -5,7 +5,7 @@ var ASCII_CTRL_RE = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]/g;
|
|
|
5
5
|
var MULTIPLE_SPACES_RE = / {2,}/g;
|
|
6
6
|
var CR_RE = /\r\n?/g;
|
|
7
7
|
var CITATION_RE = / *\(oaicite:\d+\)\{index=\d+\}/g;
|
|
8
|
-
var EM_DASH_SEPARATOR_RE =
|
|
8
|
+
var EM_DASH_SEPARATOR_RE = /\s*[—–―‒]\s*/g;
|
|
9
9
|
var TYPOGRAPHY_REPLACEMENTS = [
|
|
10
10
|
{
|
|
11
11
|
pattern: /[\u2018\u2019\u201A]/g,
|
|
@@ -15,10 +15,6 @@ var TYPOGRAPHY_REPLACEMENTS = [
|
|
|
15
15
|
pattern: /[\u201C\u201D\u201E]/g,
|
|
16
16
|
replacement: '"'
|
|
17
17
|
},
|
|
18
|
-
{
|
|
19
|
-
pattern: /[\u2013\u2014]/g,
|
|
20
|
-
replacement: '-'
|
|
21
|
-
},
|
|
22
18
|
{
|
|
23
19
|
pattern: /\u2026/g,
|
|
24
20
|
replacement: '...'
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/parsing/parse-text.ts"],"sourcesContent":["const INVISIBLE_CHARS_RE =\n /[\\u00AD\\u180E\\u200B-\\u200C\\u200E-\\u200F\\u202A-\\u202E\\u2060-\\u2064\\u2066-\\u2069\\uFEFF]/g;\n\n/* eslint-disable no-control-regex -- intentionally matching control characters */\n// biome-ignore lint/suspicious/noControlCharactersInRegex: intentionally matching control characters for sanitization\nconst ASCII_CTRL_RE = /[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F\\u007F]/g;\n/* eslint-enable no-control-regex */\n\nconst SPACE_LIKE_RE = /[\\u00A0\\u1680\\u2000-\\u200A\\u202F\\u205F\\u3000]/g;\nconst MULTIPLE_SPACES_RE = / {2,}/g;\nconst CR_RE = /\\r\\n?/g;\nconst CITATION_RE = / *\\(oaicite:\\d+\\)\\{index=\\d+\\}/g;\nconst EM_DASH_SEPARATOR_RE =
|
|
1
|
+
{"version":3,"sources":["../../src/parsing/parse-text.ts"],"sourcesContent":["const INVISIBLE_CHARS_RE =\n /[\\u00AD\\u180E\\u200B-\\u200C\\u200E-\\u200F\\u202A-\\u202E\\u2060-\\u2064\\u2066-\\u2069\\uFEFF]/g;\n\n/* eslint-disable no-control-regex -- intentionally matching control characters */\n// biome-ignore lint/suspicious/noControlCharactersInRegex: intentionally matching control characters for sanitization\nconst ASCII_CTRL_RE = /[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F\\u007F]/g;\n/* eslint-enable no-control-regex */\n\nconst SPACE_LIKE_RE = /[\\u00A0\\u1680\\u2000-\\u200A\\u202F\\u205F\\u3000]/g;\nconst MULTIPLE_SPACES_RE = / {2,}/g;\nconst CR_RE = /\\r\\n?/g;\nconst CITATION_RE = / *\\(oaicite:\\d+\\)\\{index=\\d+\\}/g;\nconst EM_DASH_SEPARATOR_RE = /\\s*[—–―‒]\\s*/g;\n\nconst TYPOGRAPHY_REPLACEMENTS: Array<{ pattern: RegExp; replacement: string }> = [\n { pattern: /[\\u2018\\u2019\\u201A]/g, replacement: \"'\" },\n { pattern: /[\\u201C\\u201D\\u201E]/g, replacement: '\"' },\n { pattern: /\\u2026/g, replacement: '...' },\n { pattern: /[\\u2022\\u25AA-\\u25AB\\u25B8-\\u25B9\\u25CF]/g, replacement: '-' },\n];\n\nexport interface ParseTextOptions {\n /** Collapse multiple spaces into one (default: true) */\n collapseSpaces?: boolean;\n /** Convert em/en dashes with spaces to commas (default: true) */\n normalizeEmDashesToCommas?: boolean;\n}\n\n/**\n * Parses and sanitizes text by removing AI artifacts and normalizing typography.\n *\n * @param text - The text to parse\n * @param options - Parsing options\n * @returns The cleaned text\n */\nexport function parseText(text: string, options: ParseTextOptions = {}): string {\n const { normalizeEmDashesToCommas = true, collapseSpaces = true } = options;\n\n if (!text) return '';\n\n let result = text;\n\n if (result.charCodeAt(0) === 0xfeff) {\n result = result.slice(1);\n }\n\n result = result.replace(CR_RE, '\\n');\n result = result.replace(CITATION_RE, '');\n result = result.normalize('NFKC');\n result = result.replace(INVISIBLE_CHARS_RE, '');\n result = result.replace(ASCII_CTRL_RE, '');\n\n if (normalizeEmDashesToCommas) {\n result = result.replace(EM_DASH_SEPARATOR_RE, ', ');\n }\n\n result = result.replace(SPACE_LIKE_RE, ' ');\n\n for (const { pattern, replacement } of TYPOGRAPHY_REPLACEMENTS) {\n result = result.replace(pattern, replacement);\n }\n\n if (collapseSpaces) {\n result = result.replace(MULTIPLE_SPACES_RE, ' ').trim();\n }\n\n return result;\n}\n"],"names":["INVISIBLE_CHARS_RE","ASCII_CTRL_RE","SPACE_LIKE_RE","MULTIPLE_SPACES_RE","CR_RE","CITATION_RE","EM_DASH_SEPARATOR_RE","TYPOGRAPHY_REPLACEMENTS","pattern","replacement","parseText","text","options","normalizeEmDashesToCommas","collapseSpaces","result","charCodeAt","slice","replace","normalize","trim"],"mappings":"AAAA,IAAMA,qBACF;AAEJ,gFAAgF,GAChF,sHAAsH;AACtH,IAAMC,gBAAgB;AACtB,kCAAkC,GAElC,IAAMC,gBAAgB;AACtB,IAAMC,qBAAqB;AAC3B,IAAMC,QAAQ;AACd,IAAMC,cAAc;AACpB,IAAMC,uBAAuB;AAE7B,IAAMC,0BAA2E;IAC7E;QAAEC,SAAS;QAAyBC,aAAa;IAAI;IACrD;QAAED,SAAS;QAAyBC,aAAa;IAAI;IACrD;QAAED,SAAS;QAAWC,aAAa;IAAM;IACzC;QAAED,SAAS;QAA6CC,aAAa;IAAI;CAC5E;AASD;;;;;;CAMC,GACD,OAAO,SAASC,UAAUC,IAAY;QAAEC,UAAAA,iEAA4B,CAAC;IACjE,yCAAoEA,QAA5DC,2BAAAA,4EAA4B,qEAAgCD,QAA1BE,gBAAAA,sDAAiB;IAE3D,IAAI,CAACH,MAAM,OAAO;IAElB,IAAII,SAASJ;IAEb,IAAII,OAAOC,UAAU,CAAC,OAAO,QAAQ;QACjCD,SAASA,OAAOE,KAAK,CAAC;IAC1B;IAEAF,SAASA,OAAOG,OAAO,CAACd,OAAO;IAC/BW,SAASA,OAAOG,OAAO,CAACb,aAAa;IACrCU,SAASA,OAAOI,SAAS,CAAC;IAC1BJ,SAASA,OAAOG,OAAO,CAAClB,oBAAoB;IAC5Ce,SAASA,OAAOG,OAAO,CAACjB,eAAe;IAEvC,IAAIY,2BAA2B;QAC3BE,SAASA,OAAOG,OAAO,CAACZ,sBAAsB;IAClD;IAEAS,SAASA,OAAOG,OAAO,CAAChB,eAAe;QAElC,kCAAA,2BAAA;;QAAL,QAAK,YAAkCK,4CAAlC,SAAA,6BAAA,QAAA,yBAAA,iCAA2D;YAA3D,kBAAA,aAAQC,sBAAAA,SAASC,0BAAAA;YAClBM,SAASA,OAAOG,OAAO,CAACV,SAASC;QACrC;;QAFK;QAAA;;;iBAAA,6BAAA;gBAAA;;;gBAAA;sBAAA;;;;IAIL,IAAIK,gBAAgB;QAChBC,SAASA,OAAOG,OAAO,CAACf,oBAAoB,KAAKiB,IAAI;IACzD;IAEA,OAAOL;AACX"}
|