axyseo 2.1.8 → 2.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/build/helpers/getLanguageResearcher.js +71 -0
  2. package/build/helpers/getLanguageResearcher.js.map +1 -0
  3. package/build/languageProcessing/helpers/language/chineseHelperFactory.js +162 -0
  4. package/build/languageProcessing/helpers/language/chineseHelperFactory.js.map +1 -0
  5. package/build/languageProcessing/helpers/language/isChineseText.js +17 -0
  6. package/build/languageProcessing/helpers/language/isChineseText.js.map +1 -0
  7. package/build/languageProcessing/helpers/match/matchTextWithWord.js +1 -1
  8. package/build/languageProcessing/helpers/match/matchTextWithWord.js.map +1 -1
  9. package/build/languageProcessing/helpers/word/getWords.js +22 -14
  10. package/build/languageProcessing/helpers/word/getWords.js.map +1 -1
  11. package/build/languageProcessing/languages/zh/Researcher.js +41 -0
  12. package/build/languageProcessing/languages/zh/Researcher.js.map +1 -0
  13. package/build/languageProcessing/languages/zh/config/functionWords.js +40 -0
  14. package/build/languageProcessing/languages/zh/config/functionWords.js.map +1 -0
  15. package/build/languageProcessing/languages/zh/helpers/getSentences.js +42 -0
  16. package/build/languageProcessing/languages/zh/helpers/getSentences.js.map +1 -0
  17. package/build/languageProcessing/languages/zh/helpers/matchTextWithWord.js +35 -0
  18. package/build/languageProcessing/languages/zh/helpers/matchTextWithWord.js.map +1 -0
  19. package/build/languageProcessing/languages/zh/helpers/splitIntoTokensCustom.js +41 -0
  20. package/build/languageProcessing/languages/zh/helpers/splitIntoTokensCustom.js.map +1 -0
  21. package/build/languageProcessing/researches/findKeywordInFirstParagraph.js +23 -1
  22. package/build/languageProcessing/researches/findKeywordInFirstParagraph.js.map +1 -1
  23. package/build/languageProcessing/researches/getAnchorsWithKeyphrase.js +22 -17
  24. package/build/languageProcessing/researches/getAnchorsWithKeyphrase.js.map +1 -1
  25. package/build/languageProcessing/researches/getParagraphs.js +13 -4
  26. package/build/languageProcessing/researches/getParagraphs.js.map +1 -1
  27. package/build/languageProcessing/researches/keywordCount.js +29 -1
  28. package/build/languageProcessing/researches/keywordCount.js.map +1 -1
  29. package/build/languageProcessing/researches/keywordCountInUrl.js +150 -5
  30. package/build/languageProcessing/researches/keywordCountInUrl.js.map +1 -1
  31. package/build/languageProcessing/researches/metaDescriptionKeyword.js +16 -4
  32. package/build/languageProcessing/researches/metaDescriptionKeyword.js.map +1 -1
  33. package/build/scoring/assessments/seo/IntroductionKeywordAssessment.js +5 -1
  34. package/build/scoring/assessments/seo/IntroductionKeywordAssessment.js.map +1 -1
  35. package/build/scoring/assessments/seo/KeywordDensityAssessment.js.map +1 -1
  36. package/build/scoring/assessments/seo/UrlKeywordAssessment.js +5 -1
  37. package/build/scoring/assessments/seo/UrlKeywordAssessment.js.map +1 -1
  38. package/package.json +1 -1
@@ -0,0 +1,71 @@
1
import { languageProcessing } from "../index.js";
// NOTE(review): AbstractResearcher is destructured but never used in this
// module — kept to avoid changing the module's import surface; confirm whether
// it can be dropped.
const {
  AbstractResearcher
} = languageProcessing;

// Import language-specific researchers
import EnglishResearcher from "../languageProcessing/languages/en/Researcher.js";
import ChineseResearcher from "../languageProcessing/languages/zh/Researcher.js";
import JapaneseResearcher from "../languageProcessing/languages/ja/Researcher.js";
import DefaultResearcher from "../languageProcessing/languages/_default/Researcher.js";
import isChineseText from "../languageProcessing/helpers/language/isChineseText";

/**
 * Detects the language of the given paper and returns the appropriate researcher.
 *
 * Resolution order:
 *   1. Explicit locale (primary subtag only, e.g. "zh-CN" -> "zh").
 *   2. Text-based detection: Japanese kana first, then Chinese ideographs.
 *      Kana MUST be checked before Chinese: Japanese text normally mixes kana
 *      with kanji, and kanji fall inside the U+4E00–U+9FFF range matched by
 *      isChineseText, so checking Chinese first misroutes Japanese text.
 *   3. English for a missing/English locale, otherwise the default researcher.
 *
 * @param {Paper} paper The paper object containing the text to analyze.
 * @param {string} [locale] Optional locale override.
 *
 * @returns {AbstractResearcher} The appropriate researcher for the detected language.
 */
export default function getLanguageResearcher(paper, locale = null) {
  // Use provided locale or try to get from paper.
  const paperLocale = locale || (paper && paper.getLocale ? paper.getLocale() : '');

  // Collect keyword, title and body text for language detection.
  let text = '';
  if (paper) {
    const keyword = paper.getKeyword ? paper.getKeyword() : '';
    const title = paper.getTitle ? paper.getTitle() : '';
    const content = paper.getText ? paper.getText() : '';
    text = [keyword, title, content].join(' ');
  }

  // Explicit locale mapping. Only the primary subtag is compared, so "zh-CN"
  // and "zh-TW" both resolve to 'zh' — the previous separate 'zh-cn'/'zh-tw'
  // cases were unreachable after split('-')[0].
  if (paperLocale) {
    const languageCode = paperLocale.split('-')[0].toLowerCase();
    switch (languageCode) {
      case 'zh':
        return new ChineseResearcher(paper);
      case 'ja':
        return new JapaneseResearcher(paper);
      case 'en':
        return new EnglishResearcher(paper);
      default:
        // Continue to text-based detection.
        break;
    }
  }

  // Text-based detection: hiragana (U+3040–U+309F) or katakana (U+30A0–U+30FF)
  // is a strong Japanese signal and is checked before the Chinese test.
  const kanaRegex = /[\u3040-\u309f\u30a0-\u30ff]/;
  if (kanaRegex.test(text)) {
    return new JapaneseResearcher(paper);
  }

  // CJK ideographs with no kana present: treat as Chinese.
  if (isChineseText(text)) {
    return new ChineseResearcher(paper);
  }

  // Default to English when no locale was given or it was explicitly English.
  if (paperLocale === 'en' || !paperLocale) {
    return new EnglishResearcher(paper);
  }

  // Fallback for unrecognized locales.
  return new DefaultResearcher(paper);
}
//# sourceMappingURL=getLanguageResearcher.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"getLanguageResearcher.js","names":["languageProcessing","AbstractResearcher","EnglishResearcher","ChineseResearcher","JapaneseResearcher","DefaultResearcher","isChineseText","getLanguageResearcher","paper","locale","paperLocale","getLocale","text","keyword","getKeyword","title","getTitle","content","getText","join","languageCode","split","toLowerCase","japaneseRegex","test"],"sources":["../../src/helpers/getLanguageResearcher.js"],"sourcesContent":["import {languageProcessing} from '@axyseo/index.js';\nconst {AbstractResearcher} = languageProcessing;\n\n// Import language-specific researchers\nimport EnglishResearcher from '../languageProcessing/languages/en/Researcher.js';\nimport ChineseResearcher from '../languageProcessing/languages/zh/Researcher.js';\nimport JapaneseResearcher from '../languageProcessing/languages/ja/Researcher.js';\nimport DefaultResearcher from '../languageProcessing/languages/_default/Researcher.js';\n\nimport isChineseText from '../languageProcessing/helpers/language/isChineseText';\n\n/**\n * Detects the language of the given text and returns the appropriate researcher.\n *\n * @param {Paper} paper The paper object containing the text to analyze.\n * @param {string} [locale] Optional locale override.\n *\n * @returns {AbstractResearcher} The appropriate researcher for the detected language.\n */\nexport default function getLanguageResearcher(paper, locale = null) {\n // Use provided locale or try to get from paper\n const paperLocale = locale || (paper && paper.getLocale ? paper.getLocale() : '');\n\n // Get text from paper for language detection\n let text = '';\n if (paper) {\n const keyword = paper.getKeyword ? paper.getKeyword() : '';\n const title = paper.getTitle ? paper.getTitle() : '';\n const content = paper.getText ? 
paper.getText() : '';\n text = [keyword, title, content].join(' ');\n }\n\n // Explicit locale mapping\n if (paperLocale) {\n const languageCode = paperLocale.split('-')[0].toLowerCase();\n\n switch (languageCode) {\n case 'zh':\n case 'zh-cn':\n case 'zh-tw':\n return new ChineseResearcher(paper);\n case 'ja':\n return new JapaneseResearcher(paper);\n case 'en':\n return new EnglishResearcher(paper);\n default:\n // Continue to text-based detection\n break;\n }\n }\n\n // Text-based language detection\n if (isChineseText(text)) {\n return new ChineseResearcher(paper);\n }\n\n // Check for Japanese characters\n const japaneseRegex = /[\\u3040-\\u309f\\u30a0-\\u30ff\\u4e00-\\u9faf]/;\n if (japaneseRegex.test(text)) {\n return new JapaneseResearcher(paper);\n }\n\n // Default to English or provided locale\n if (paperLocale === 'en' || !paperLocale) {\n return new EnglishResearcher(paper);\n }\n\n // Fallback to default researcher\n return new DefaultResearcher(paper);\n}\n"],"mappings":"AAAA,SAAQA,kBAAkB;AAC1B,MAAM;EAACC;AAAkB,CAAC,GAAGD,kBAAkB;;AAE/C;AACA,OAAOE,iBAAiB;AACxB,OAAOC,iBAAiB;AACxB,OAAOC,kBAAkB;AACzB,OAAOC,iBAAiB;AAExB,OAAOC,aAAa;;AAEpB;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASC,qBAAqBA,CAACC,KAAK,EAAEC,MAAM,GAAG,IAAI,EAAE;EAClE;EACA,MAAMC,WAAW,GAAGD,MAAM,KAAKD,KAAK,IAAIA,KAAK,CAACG,SAAS,GAAGH,KAAK,CAACG,SAAS,CAAC,CAAC,GAAG,EAAE,CAAC;;EAEjF;EACA,IAAIC,IAAI,GAAG,EAAE;EACb,IAAIJ,KAAK,EAAE;IACT,MAAMK,OAAO,GAAGL,KAAK,CAACM,UAAU,GAAGN,KAAK,CAACM,UAAU,CAAC,CAAC,GAAG,EAAE;IAC1D,MAAMC,KAAK,GAAGP,KAAK,CAACQ,QAAQ,GAAGR,KAAK,CAACQ,QAAQ,CAAC,CAAC,GAAG,EAAE;IACpD,MAAMC,OAAO,GAAGT,KAAK,CAACU,OAAO,GAAGV,KAAK,CAACU,OAAO,CAAC,CAAC,GAAG,EAAE;IACpDN,IAAI,GAAG,CAACC,OAAO,EAAEE,KAAK,EAAEE,OAAO,CAAC,CAACE,IAAI,CAAC,GAAG,CAAC;EAC5C;;EAEA;EACA,IAAIT,WAAW,EAAE;IACf,MAAMU,YAAY,GAAGV,WAAW,CAACW,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAACC,WAAW,CAAC,CAAC;IAE5D,QAAQF,YAAY;MAClB,KAAK,IAAI;MACT,KAAK,OAAO;MACZ,KAAK,OAAO;QACV,OAAO,IAAIjB,iBAAiB,CAACK,KAAK,CAAC;MACrC,KAAK,IAAI;QACP,OAA
O,IAAIJ,kBAAkB,CAACI,KAAK,CAAC;MACtC,KAAK,IAAI;QACP,OAAO,IAAIN,iBAAiB,CAACM,KAAK,CAAC;MACrC;QACE;QACA;IACJ;EACF;;EAEA;EACA,IAAIF,aAAa,CAACM,IAAI,CAAC,EAAE;IACvB,OAAO,IAAIT,iBAAiB,CAACK,KAAK,CAAC;EACrC;;EAEA;EACA,MAAMe,aAAa,GAAG,2CAA2C;EACjE,IAAIA,aAAa,CAACC,IAAI,CAACZ,IAAI,CAAC,EAAE;IAC5B,OAAO,IAAIR,kBAAkB,CAACI,KAAK,CAAC;EACtC;;EAEA;EACA,IAAIE,WAAW,KAAK,IAAI,IAAI,CAACA,WAAW,EAAE;IACxC,OAAO,IAAIR,iBAAiB,CAACM,KAAK,CAAC;EACrC;;EAEA;EACA,OAAO,IAAIH,iBAAiB,CAACG,KAAK,CAAC;AACrC","ignoreList":[]}
@@ -0,0 +1,162 @@
1
+ import isChineseText from "./isChineseText.js";
2
+
3
/**
 * Creates a Chinese helper function for matching words in sentences.
 * This is used to provide consistent Chinese text processing across all functions.
 *
 * The returned matcher accepts either a plain string or a sentence-like
 * object with a `text` property, and returns one entry per non-overlapping,
 * case-insensitive occurrence of `word`.
 *
 * @returns {Function} Chinese helper function `(sentence, word) => string[]`.
 */
export function createChineseMatchHelper() {
  return function (sentence, word) {
    // Accept a plain string or a sentence-like object carrying `text`.
    // The original fell back to the object itself (and threw on a null
    // sentence), then crashed on .toLowerCase(); guard instead.
    const raw = typeof sentence === 'string' ? sentence : (sentence && sentence.text) || sentence;
    if (typeof raw !== 'string' || typeof word !== 'string' || raw === '' || word === '') {
      return [];
    }

    const matches = [];
    const lowerText = raw.toLowerCase();
    const lowerWord = word.toLowerCase();

    // Exact, non-overlapping, case-insensitive substring scan.
    let startIndex = 0;
    let index;
    while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {
      matches.push(word);
      startIndex = index + lowerWord.length;
    }
    return matches;
  };
}
29
+
30
/**
 * Creates an enhanced Chinese helper function specifically for URL/slug matching.
 * This provides better matching logic for Chinese keywords in URL contexts:
 * exact occurrences are counted first, and when none are found for a Chinese
 * character, a fuzzy translation/pinyin lookup is tried as a fallback.
 *
 * @returns {Function} Enhanced matcher `(slugText, chineseChar) => string[]`.
 */
export function createChineseUrlMatchHelper() {
  return function (slugText, chineseChar) {
    const haystack = typeof slugText === 'string' ? slugText : slugText.text || slugText;
    const found = [];
    if (!haystack || !chineseChar) {
      return found;
    }

    const loweredHaystack = haystack.toLowerCase();
    const loweredNeedle = chineseChar.toLowerCase();

    // Pass 1: exact, non-overlapping occurrences of the character in the slug.
    for (let from = 0; ;) {
      const at = loweredHaystack.indexOf(loweredNeedle, from);
      if (at === -1) {
        break;
      }
      found.push(chineseChar);
      from = at + loweredNeedle.length;
    }

    // Pass 2: fuzzy fallback — when the slug has no literal occurrence of a
    // Chinese character, a related English/pinyin term still counts as one
    // partial match, to improve the URL keyword assessment experience.
    if (found.length === 0 && isChineseText(chineseChar)) {
      if (checkForRelevantContent(loweredHaystack, loweredNeedle)) {
        found.push(chineseChar);
      }
    }
    return found;
  };
}
66
+
67
/**
 * Checks if the slug contains content that might be related to the Chinese character.
 * This is a fuzzy matching approach for better URL keyword assessment: each
 * supported character maps to English translations and pinyin spellings.
 *
 * @param {string} slugText - The slug text to check.
 * @param {string} chineseChar - The Chinese character to match.
 * @returns {boolean} True if relevant content is found.
 */
function checkForRelevantContent(slugText, chineseChar) {
  // For common Chinese business/logistics terms, provide some fuzzy matching.
  const chineseToEnglishMap = {
    跨: ['cross', 'trans', 'inter', 'kuà', 'kua', 'kuajing'],
    境: ['border', 'boundary', 'realm', 'jìng', 'jing', 'kuajing'],
    物: ['goods', 'item', 'thing', 'material', 'wù', 'wu', 'wuliu'],
    流: ['flow', 'current', 'logistics', 'liú', 'liu', 'wuliu'],
    货: ['goods', 'cargo', 'freight', 'huò', 'huo'],
    运: ['transport', 'shipping', 'yùn', 'yun'],
    输: ['transport', 'transmit', 'shū', 'shu'],
    送: ['deliver', 'send', 'sòng', 'song'],
    配: ['distribute', 'match', 'pèi', 'pei'],
    仓: ['warehouse', 'storage', 'cāng', 'cang'],
    储: ['store', 'storage', 'chǔ', 'chu']
  };

  // Object.hasOwn prevents prototype-chain lookups: an input such as
  // "constructor" would otherwise resolve to an Object.prototype member
  // and crash on .some().
  const possibleMatches = Object.hasOwn(chineseToEnglishMap, chineseChar)
    ? chineseToEnglishMap[chineseChar]
    : [];

  // Check if slug contains any related terms.
  return possibleMatches.some(term => slugText.includes(term.toLowerCase()));
}
95
+
96
/**
 * Gets the appropriate match helper based on text content.
 * Auto-detects Chinese text and returns a Chinese helper if needed.
 *
 * @param {string} text - Text to analyze.
 * @param {string} keyword - Keyword to analyze.
 * @param {Function} existingHelper - Existing helper if available.
 * @param {boolean} isUrlContext - Whether this is for URL/slug matching.
 * @returns {Function|boolean} Appropriate helper function, or false for
 *                             default (non-Chinese) behavior.
 */
export function getMatchHelper(text, keyword, existingHelper, isUrlContext = false) {
  // A caller-supplied helper always wins.
  if (existingHelper) {
    return existingHelper;
  }

  // Non-Chinese content falls back to default behavior.
  const combined = keyword + ' ' + text;
  if (!isChineseText(combined)) {
    return false;
  }

  // Chinese content: URL/slug matching uses the fuzzier variant.
  return isUrlContext ? createChineseUrlMatchHelper() : createChineseMatchHelper();
}
120
+
121
/**
 * Applies a Chinese word-matching helper to a researcher when the given
 * text/keyword is detected as Chinese and no custom helper is registered yet.
 *
 * The original researcher is never mutated: the returned object uses it as
 * its prototype and carries its own shallow copy of `helpers`.
 *
 * @param {Object} researcher - The researcher object to update.
 * @param {string} text - Text to analyze.
 * @param {string} keyword - Keyword to analyze.
 * @param {boolean} isUrlContext - Whether this is for URL/slug matching.
 * @returns {Object} The enhanced researcher, or the original when no
 *                   enhancement is needed.
 */
export function enhanceResearcherForChinese(researcher, text, keyword, isUrlContext = false) {
  const hasCustomHelper = Boolean(researcher.getHelper('matchWordCustomHelper'));
  if (hasCustomHelper || !isChineseText(keyword + ' ' + text)) {
    return researcher;
  }

  // Prototype-based wrapper over the researcher, with a private helpers copy.
  const wrapped = Object.create(researcher);
  wrapped.helpers = {
    ...researcher.helpers
  };
  wrapped.helpers.matchWordCustomHelper = isUrlContext
    ? createChineseUrlMatchHelper()
    : createChineseMatchHelper();
  return wrapped;
}
143
+
144
/**
 * Checks if auto-Chinese enhancement should be applied.
 *
 * @param {Object} researcher - The researcher object.
 * @param {string} text - Text to analyze.
 * @param {string} keyword - Keyword to analyze.
 * @returns {boolean} True if Chinese enhancement should be applied.
 */
export function shouldEnhanceForChinese(researcher, text, keyword) {
  const alreadyCustomized = Boolean(researcher.getHelper('matchWordCustomHelper'));
  return !alreadyCustomized && isChineseText(keyword + ' ' + text);
}
155
// Aggregate default export so consumers can pull in the whole helper set at once.
const chineseHelperFactory = {
  createChineseMatchHelper,
  createChineseUrlMatchHelper,
  getMatchHelper,
  enhanceResearcherForChinese,
  shouldEnhanceForChinese
};
export default chineseHelperFactory;
//# sourceMappingURL=chineseHelperFactory.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chineseHelperFactory.js","names":["isChineseText","createChineseMatchHelper","sentence","word","text","matches","lowerText","toLowerCase","lowerWord","startIndex","index","indexOf","push","length","createChineseUrlMatchHelper","slugText","chineseChar","lowerChar","hasRelevantContent","checkForRelevantContent","chineseToEnglishMap","跨","境","物","流","货","运","输","送","配","仓","储","possibleMatches","some","term","includes","getMatchHelper","keyword","existingHelper","isUrlContext","enhanceResearcherForChinese","researcher","getHelper","enhancedResearcher","Object","create","helpers","matchWordCustomHelper","shouldEnhanceForChinese"],"sources":["../../../../src/languageProcessing/helpers/language/chineseHelperFactory.js"],"sourcesContent":["import isChineseText from './isChineseText.js';\n\n/**\n * Creates a Chinese helper function for matching words in sentences\n * This is used to provide consistent Chinese text processing across all functions\n *\n * @returns {Function} Chinese helper function for matching words\n */\nexport function createChineseMatchHelper() {\n return function(sentence, word) {\n const text = typeof sentence === 'string' ? 
sentence : sentence.text || sentence;\n const matches = [];\n\n if (!text || !word) {\n return matches;\n }\n\n const lowerText = text.toLowerCase();\n const lowerWord = word.toLowerCase();\n\n let startIndex = 0;\n let index;\n\n // Use exact string matching for Chinese text\n while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {\n matches.push(word);\n startIndex = index + lowerWord.length;\n }\n\n return matches;\n };\n}\n\n/**\n * Creates an enhanced Chinese helper function specifically for URL/slug matching\n * This provides better matching logic for Chinese keywords in URL contexts\n *\n * @returns {Function} Enhanced Chinese helper function for URL matching\n */\nexport function createChineseUrlMatchHelper() {\n return function(slugText, chineseChar) {\n const text = typeof slugText === 'string' ? slugText : slugText.text || slugText;\n const matches = [];\n\n if (!text || !chineseChar) {\n return matches;\n }\n\n const lowerText = text.toLowerCase();\n const lowerChar = chineseChar.toLowerCase();\n\n // First, try exact character matching (for Chinese characters in slug)\n let startIndex = 0;\n let index;\n while ((index = lowerText.indexOf(lowerChar, startIndex)) !== -1) {\n matches.push(chineseChar);\n startIndex = index + lowerChar.length;\n }\n\n // If no exact matches found and we have a Chinese character, try fuzzy matching\n if (matches.length === 0 && isChineseText(chineseChar)) {\n // For URL context, be more lenient - if the slug contains any related terms,\n // we'll consider it a partial match to improve user experience\n const hasRelevantContent = checkForRelevantContent(lowerText, lowerChar);\n if (hasRelevantContent) {\n matches.push(chineseChar);\n }\n }\n\n return matches;\n };\n}\n\n/**\n * Checks if the slug contains content that might be related to the Chinese character\n * This is a fuzzy matching approach for better URL keyword assessment\n *\n * @param {string} slugText - The slug text to check\n * @param {string} 
chineseChar - The Chinese character to match\n * @returns {boolean} True if relevant content is found\n */\nfunction checkForRelevantContent(slugText, chineseChar) {\n // For common Chinese business/logistics terms, provide some fuzzy matching\n const chineseToEnglishMap = {\n 跨: ['cross', 'trans', 'inter', 'kuà', 'kua', 'kuajing'],\n 境: ['border', 'boundary', 'realm', 'jìng', 'jing', 'kuajing'],\n 物: ['goods', 'item', 'thing', 'material', 'wù', 'wu', 'wuliu'],\n 流: ['flow', 'current', 'logistics', 'liú', 'liu', 'wuliu'],\n 货: ['goods', 'cargo', 'freight', 'huò', 'huo'],\n 运: ['transport', 'shipping', 'yùn', 'yun'],\n 输: ['transport', 'transmit', 'shū', 'shu'],\n 送: ['deliver', 'send', 'sòng', 'song'],\n 配: ['distribute', 'match', 'pèi', 'pei'],\n 仓: ['warehouse', 'storage', 'cāng', 'cang'],\n 储: ['store', 'storage', 'chǔ', 'chu']\n };\n\n const possibleMatches = chineseToEnglishMap[chineseChar] || [];\n\n // Check if slug contains any related terms\n return possibleMatches.some(term => slugText.includes(term.toLowerCase()));\n}\n\n/**\n * Gets appropriate match helper based on text content\n * Auto-detects Chinese text and returns Chinese helper if needed\n *\n * @param {string} text - Text to analyze\n * @param {string} keyword - Keyword to analyze\n * @param {Function} existingHelper - Existing helper if available\n * @param {boolean} isUrlContext - Whether this is for URL/slug matching\n * @returns {Function|boolean} Appropriate helper function or false\n */\nexport function getMatchHelper(text, keyword, existingHelper, isUrlContext = false) {\n // If there's already a custom helper, use it\n if (existingHelper) {\n return existingHelper;\n }\n\n // Auto-detect Chinese and provide appropriate Chinese helper\n if (isChineseText(keyword + ' ' + text)) {\n return isUrlContext ? 
createChineseUrlMatchHelper() : createChineseMatchHelper();\n }\n\n // Return false for default behavior with other languages\n return false;\n}\n\n/**\n * Applies Chinese helper to researcher if Chinese text is detected\n * This is a utility function for updating researchers dynamically\n *\n * @param {Object} researcher - The researcher object to update\n * @param {string} text - Text to analyze\n * @param {string} keyword - Keyword to analyze\n * @param {boolean} isUrlContext - Whether this is for URL/slug matching\n * @returns {Object} Updated researcher object\n */\nexport function enhanceResearcherForChinese(researcher, text, keyword, isUrlContext = false) {\n if (!researcher.getHelper('matchWordCustomHelper') && isChineseText(keyword + ' ' + text)) {\n // Clone the researcher to avoid modifying original\n const enhancedResearcher = Object.create(researcher);\n enhancedResearcher.helpers = {...researcher.helpers};\n enhancedResearcher.helpers.matchWordCustomHelper = isUrlContext\n ? 
createChineseUrlMatchHelper()\n : createChineseMatchHelper();\n return enhancedResearcher;\n }\n\n return researcher;\n}\n\n/**\n * Checks if auto-Chinese enhancement should be applied\n *\n * @param {Object} researcher - The researcher object\n * @param {string} text - Text to analyze\n * @param {string} keyword - Keyword to analyze\n * @returns {boolean} True if Chinese enhancement should be applied\n */\nexport function shouldEnhanceForChinese(researcher, text, keyword) {\n return !researcher.getHelper('matchWordCustomHelper') && isChineseText(keyword + ' ' + text);\n}\n\nexport default {\n createChineseMatchHelper,\n createChineseUrlMatchHelper,\n getMatchHelper,\n enhanceResearcherForChinese,\n shouldEnhanceForChinese\n};\n"],"mappings":"AAAA,OAAOA,aAAa;;AAEpB;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASC,wBAAwBA,CAAA,EAAG;EACzC,OAAO,UAASC,QAAQ,EAAEC,IAAI,EAAE;IAC9B,MAAMC,IAAI,GAAG,OAAOF,QAAQ,KAAK,QAAQ,GAAGA,QAAQ,GAAGA,QAAQ,CAACE,IAAI,IAAIF,QAAQ;IAChF,MAAMG,OAAO,GAAG,EAAE;IAElB,IAAI,CAACD,IAAI,IAAI,CAACD,IAAI,EAAE;MAClB,OAAOE,OAAO;IAChB;IAEA,MAAMC,SAAS,GAAGF,IAAI,CAACG,WAAW,CAAC,CAAC;IACpC,MAAMC,SAAS,GAAGL,IAAI,CAACI,WAAW,CAAC,CAAC;IAEpC,IAAIE,UAAU,GAAG,CAAC;IAClB,IAAIC,KAAK;;IAET;IACA,OAAO,CAACA,KAAK,GAAGJ,SAAS,CAACK,OAAO,CAACH,SAAS,EAAEC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE;MAChEJ,OAAO,CAACO,IAAI,CAACT,IAAI,CAAC;MAClBM,UAAU,GAAGC,KAAK,GAAGF,SAAS,CAACK,MAAM;IACvC;IAEA,OAAOR,OAAO;EAChB,CAAC;AACH;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASS,2BAA2BA,CAAA,EAAG;EAC5C,OAAO,UAASC,QAAQ,EAAEC,WAAW,EAAE;IACrC,MAAMZ,IAAI,GAAG,OAAOW,QAAQ,KAAK,QAAQ,GAAGA,QAAQ,GAAGA,QAAQ,CAACX,IAAI,IAAIW,QAAQ;IAChF,MAAMV,OAAO,GAAG,EAAE;IAElB,IAAI,CAACD,IAAI,IAAI,CAACY,WAAW,EAAE;MACzB,OAAOX,OAAO;IAChB;IAEA,MAAMC,SAAS,GAAGF,IAAI,CAACG,WAAW,CAAC,CAAC;IACpC,MAAMU,SAAS,GAAGD,WAAW,CAACT,WAAW,CAAC,CAAC;;IAE3C;IACA,IAAIE,UAAU,GAAG,CAAC;IAClB,IAAIC,KAAK;IACT,OAAO,CAACA,KAAK,GAAGJ,SAAS,CAACK,OAAO,CAACM,SAAS,EAAER,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE;MAChEJ,OAAO,CAACO,IAAI,CAACI,WAAW,CAAC;MACzBP,UAAU,GAAGC,KAAK,GAAGO,
SAAS,CAACJ,MAAM;IACvC;;IAEA;IACA,IAAIR,OAAO,CAACQ,MAAM,KAAK,CAAC,IAAIb,aAAa,CAACgB,WAAW,CAAC,EAAE;MACtD;MACA;MACA,MAAME,kBAAkB,GAAGC,uBAAuB,CAACb,SAAS,EAAEW,SAAS,CAAC;MACxE,IAAIC,kBAAkB,EAAE;QACtBb,OAAO,CAACO,IAAI,CAACI,WAAW,CAAC;MAC3B;IACF;IAEA,OAAOX,OAAO;EAChB,CAAC;AACH;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,SAASc,uBAAuBA,CAACJ,QAAQ,EAAEC,WAAW,EAAE;EACtD;EACA,MAAMI,mBAAmB,GAAG;IAC1BC,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,SAAS,CAAC;IACvDC,CAAC,EAAE,CAAC,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,CAAC;IAC7DC,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,CAAC;IAC9DC,CAAC,EAAE,CAAC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,KAAK,EAAE,KAAK,EAAE,OAAO,CAAC;IAC1DC,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,KAAK,CAAC;IAC9CC,CAAC,EAAE,CAAC,WAAW,EAAE,UAAU,EAAE,KAAK,EAAE,KAAK,CAAC;IAC1CC,CAAC,EAAE,CAAC,WAAW,EAAE,UAAU,EAAE,KAAK,EAAE,KAAK,CAAC;IAC1CC,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;IACtCC,CAAC,EAAE,CAAC,YAAY,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC;IACxCC,CAAC,EAAE,CAAC,WAAW,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,CAAC;IAC3CC,CAAC,EAAE,CAAC,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,KAAK;EACtC,CAAC;EAED,MAAMC,eAAe,GAAGZ,mBAAmB,CAACJ,WAAW,CAAC,IAAI,EAAE;;EAE9D;EACA,OAAOgB,eAAe,CAACC,IAAI,CAACC,IAAI,IAAInB,QAAQ,CAACoB,QAAQ,CAACD,IAAI,CAAC3B,WAAW,CAAC,CAAC,CAAC,CAAC;AAC5E;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAAS6B,cAAcA,CAAChC,IAAI,EAAEiC,OAAO,EAAEC,cAAc,EAAEC,YAAY,GAAG,KAAK,EAAE;EAClF;EACA,IAAID,cAAc,EAAE;IAClB,OAAOA,cAAc;EACvB;;EAEA;EACA,IAAItC,aAAa,CAACqC,OAAO,GAAG,GAAG,GAAGjC,IAAI,CAAC,EAAE;IACvC,OAAOmC,YAAY,GAAGzB,2BAA2B,CAAC,CAAC,GAAGb,wBAAwB,CAAC,CAAC;EAClF;;EAEA;EACA,OAAO,KAAK;AACd;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASuC,2BAA2BA,CAACC,UAAU,EAAErC,IAAI,EAAEiC,OAAO,EAAEE,YAAY,GAAG,KAAK,EAAE;EAC3F,IAAI,CAACE,UAAU,CAACC,SAAS,CAAC,uBAAuB,CAAC,IAAI1C,aAAa,CAACqC,OAAO,GAAG,GAAG,GAAGjC,IAAI,CAAC,EAAE;IACzF;IACA,MAAMuC,kBAAkB,GAAGC,MAAM,CAACC,MAAM,CAACJ,UAAU
,CAAC;IACpDE,kBAAkB,CAACG,OAAO,GAAG;MAAC,GAAGL,UAAU,CAACK;IAAO,CAAC;IACpDH,kBAAkB,CAACG,OAAO,CAACC,qBAAqB,GAAGR,YAAY,GAC3DzB,2BAA2B,CAAC,CAAC,GAC7Bb,wBAAwB,CAAC,CAAC;IAC9B,OAAO0C,kBAAkB;EAC3B;EAEA,OAAOF,UAAU;AACnB;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASO,uBAAuBA,CAACP,UAAU,EAAErC,IAAI,EAAEiC,OAAO,EAAE;EACjE,OAAO,CAACI,UAAU,CAACC,SAAS,CAAC,uBAAuB,CAAC,IAAI1C,aAAa,CAACqC,OAAO,GAAG,GAAG,GAAGjC,IAAI,CAAC;AAC9F;AAEA,eAAe;EACbH,wBAAwB;EACxBa,2BAA2B;EAC3BsB,cAAc;EACdI,2BAA2B;EAC3BQ;AACF,CAAC","ignoreList":[]}
@@ -0,0 +1,17 @@
1
/**
 * Checks if the given text contains Chinese characters.
 *
 * Only code points in the CJK Unified Ideographs block (U+4E00–U+9FFF)
 * are considered; kana, hangul and CJK extension blocks are not matched.
 *
 * @param {string} text The text to check.
 *
 * @returns {boolean} True if the text contains Chinese characters, false otherwise.
 */
export default function isChineseText(text) {
  // Only non-empty strings can contain Chinese characters.
  if (typeof text !== 'string' || text.length === 0) {
    return false;
  }
  return /[\u4e00-\u9fff]/.test(text);
}
//# sourceMappingURL=isChineseText.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"isChineseText.js","names":["isChineseText","text","chineseCharRegex","test"],"sources":["../../../../src/languageProcessing/helpers/language/isChineseText.js"],"sourcesContent":["/**\n * Checks if the given text contains Chinese characters.\n *\n * @param {string} text The text to check.\n *\n * @returns {boolean} True if the text contains Chinese characters, false otherwise.\n */\nexport default function isChineseText(text) {\n if (!text || typeof text !== 'string') {\n return false;\n }\n\n // Chinese character range in Unicode\n const chineseCharRegex = /[\\u4e00-\\u9fff]/;\n\n return chineseCharRegex.test(text);\n}\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASA,aAAaA,CAACC,IAAI,EAAE;EAC1C,IAAI,CAACA,IAAI,IAAI,OAAOA,IAAI,KAAK,QAAQ,EAAE;IACrC,OAAO,KAAK;EACd;;EAEA;EACA,MAAMC,gBAAgB,GAAG,iBAAiB;EAE1C,OAAOA,gBAAgB,CAACC,IAAI,CAACF,IAAI,CAAC;AACpC","ignoreList":[]}
@@ -6,7 +6,7 @@ import removePunctuation from "../sanitize/removePunctuation.js";
6
6
  import { unifyAllSpaces as unifyWhitespace } from "../sanitize/unifyWhitespace.js";
7
7
  import matchStringWithTransliteration from "./matchTextWithTransliteration.js";
8
8
  import { normalize as normalizeQuotes } from "../sanitize/quotes.js";
9
- import { map } from "lodash";
9
+ import { map } from 'lodash';
10
10
 
11
11
  /**
12
12
  * Returns the number of matches in a given string
@@ -1 +1 @@
1
- {"version":3,"file":"matchTextWithWord.js","names":["stripSomeTags","stripSpaces","removePunctuation","unifyAllSpaces","unifyWhitespace","matchStringWithTransliteration","normalize","normalizeQuotes","map","text","wordToMatch","locale","matchWordCustomHelper","matches","keyword","positions","indexOf","count","length","position","Math","min"],"sources":["../../../../src/languageProcessing/helpers/match/matchTextWithWord.js"],"sourcesContent":["/** @module stringProcessing/matchTextWithWord */\n\nimport stripSomeTags from \"../sanitize/stripNonTextTags.js\";\nimport stripSpaces from \"../sanitize/stripSpaces.js\";\nimport removePunctuation from \"../sanitize/removePunctuation.js\";\nimport { unifyAllSpaces as unifyWhitespace } from \"../sanitize/unifyWhitespace.js\";\nimport matchStringWithTransliteration from \"./matchTextWithTransliteration.js\";\nimport { normalize as normalizeQuotes } from \"../sanitize/quotes.js\";\nimport { map } from \"lodash\";\n\n/**\n * Returns the number of matches in a given string\n *\n * @param {string} text The text to use for matching the wordToMatch.\n * @param {string} wordToMatch The word to match in the text.\n * @param {string} locale \t\t\t\tThe locale used for transliteration.\n * @param {function} matchWordCustomHelper \tThe helper function to match word in text.\n *\n * @returns {Object} An array with all matches of the text, the number of the matches, and the lowest number of positions of the matches.\n */\nexport default function( text, wordToMatch, locale, matchWordCustomHelper ) {\n\ttext = stripSomeTags( text );\n\ttext = unifyWhitespace( text );\n\ttext = normalizeQuotes( text );\n\n\twordToMatch = normalizeQuotes( wordToMatch );\n\tlet matches = matchWordCustomHelper\n\t\t? 
matchWordCustomHelper( text, wordToMatch )\n\t\t: matchStringWithTransliteration( text, wordToMatch, locale );\n\n\tmatches = map( matches, function( keyword ) {\n\t\treturn stripSpaces( removePunctuation( keyword ) );\n\t} );\n\n\t// Create an array of positions of matches to determine where in the text the wordToMatch occurred first.\n\tconst positions = map( matches, function( keyword ) {\n\t\treturn text.indexOf( keyword );\n\t} );\n\n\treturn {\n\t\tcount: matches.length,\n\t\tmatches: matches,\n\t\tposition: positions.length === 0 ? -1 : Math.min( ...positions ),\n\t};\n}\n"],"mappings":"AAAA;;AAEA,OAAOA,aAAa;AACpB,OAAOC,WAAW;AAClB,OAAOC,iBAAiB;AACxB,SAASC,cAAc,IAAIC,eAAe;AAC1C,OAAOC,8BAA8B;AACrC,SAASC,SAAS,IAAIC,eAAe;AACrC,SAASC,GAAG,QAAQ,QAAQ;;AAE5B;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,UAAUC,IAAI,EAAEC,WAAW,EAAEC,MAAM,EAAEC,qBAAqB,EAAG;EAC3EH,IAAI,GAAGT,aAAa,CAAES,IAAK,CAAC;EAC5BA,IAAI,GAAGL,eAAe,CAAEK,IAAK,CAAC;EAC9BA,IAAI,GAAGF,eAAe,CAAEE,IAAK,CAAC;EAE9BC,WAAW,GAAGH,eAAe,CAAEG,WAAY,CAAC;EAC5C,IAAIG,OAAO,GAAGD,qBAAqB,GAChCA,qBAAqB,CAAEH,IAAI,EAAEC,WAAY,CAAC,GAC1CL,8BAA8B,CAAEI,IAAI,EAAEC,WAAW,EAAEC,MAAO,CAAC;EAE9DE,OAAO,GAAGL,GAAG,CAAEK,OAAO,EAAE,UAAUC,OAAO,EAAG;IAC3C,OAAOb,WAAW,CAAEC,iBAAiB,CAAEY,OAAQ,CAAE,CAAC;EACnD,CAAE,CAAC;;EAEH;EACA,MAAMC,SAAS,GAAGP,GAAG,CAAEK,OAAO,EAAE,UAAUC,OAAO,EAAG;IACnD,OAAOL,IAAI,CAACO,OAAO,CAAEF,OAAQ,CAAC;EAC/B,CAAE,CAAC;EAEH,OAAO;IACNG,KAAK,EAAEJ,OAAO,CAACK,MAAM;IACrBL,OAAO,EAAEA,OAAO;IAChBM,QAAQ,EAAEJ,SAAS,CAACG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC,GAAGE,IAAI,CAACC,GAAG,CAAE,GAAGN,SAAU;EAChE,CAAC;AACF","ignoreList":[]}
1
+ {"version":3,"file":"matchTextWithWord.js","names":["stripSomeTags","stripSpaces","removePunctuation","unifyAllSpaces","unifyWhitespace","matchStringWithTransliteration","normalize","normalizeQuotes","map","text","wordToMatch","locale","matchWordCustomHelper","matches","keyword","positions","indexOf","count","length","position","Math","min"],"sources":["../../../../src/languageProcessing/helpers/match/matchTextWithWord.js"],"sourcesContent":["/** @module stringProcessing/matchTextWithWord */\n\nimport stripSomeTags from '../sanitize/stripNonTextTags.js';\nimport stripSpaces from '../sanitize/stripSpaces.js';\nimport removePunctuation from '../sanitize/removePunctuation.js';\nimport {unifyAllSpaces as unifyWhitespace} from '../sanitize/unifyWhitespace.js';\nimport matchStringWithTransliteration from './matchTextWithTransliteration.js';\nimport {normalize as normalizeQuotes} from '../sanitize/quotes.js';\nimport {map} from 'lodash';\n\n/**\n * Returns the number of matches in a given string\n *\n * @param {string} text The text to use for matching the wordToMatch.\n * @param {string} wordToMatch The word to match in the text.\n * @param {string} locale \t\t\t\tThe locale used for transliteration.\n * @param {function} matchWordCustomHelper \tThe helper function to match word in text.\n *\n * @returns {Object} An array with all matches of the text, the number of the matches, and the lowest number of positions of the matches.\n */\nexport default function(text, wordToMatch, locale, matchWordCustomHelper) {\n text = stripSomeTags(text);\n text = unifyWhitespace(text);\n text = normalizeQuotes(text);\n\n wordToMatch = normalizeQuotes(wordToMatch);\n let matches = matchWordCustomHelper\n ? 
matchWordCustomHelper(text, wordToMatch)\n : matchStringWithTransliteration(text, wordToMatch, locale);\n matches = map(matches, function(keyword) {\n return stripSpaces(removePunctuation(keyword));\n });\n\n // Create an array of positions of matches to determine where in the text the wordToMatch occurred first.\n const positions = map(matches, function(keyword) {\n return text.indexOf(keyword);\n });\n\n return {\n count: matches.length,\n matches: matches,\n position: positions.length === 0 ? -1 : Math.min(...positions)\n };\n}\n"],"mappings":"AAAA;;AAEA,OAAOA,aAAa;AACpB,OAAOC,WAAW;AAClB,OAAOC,iBAAiB;AACxB,SAAQC,cAAc,IAAIC,eAAe;AACzC,OAAOC,8BAA8B;AACrC,SAAQC,SAAS,IAAIC,eAAe;AACpC,SAAQC,GAAG,QAAO,QAAQ;;AAE1B;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,UAASC,IAAI,EAAEC,WAAW,EAAEC,MAAM,EAAEC,qBAAqB,EAAE;EACxEH,IAAI,GAAGT,aAAa,CAACS,IAAI,CAAC;EAC1BA,IAAI,GAAGL,eAAe,CAACK,IAAI,CAAC;EAC5BA,IAAI,GAAGF,eAAe,CAACE,IAAI,CAAC;EAE5BC,WAAW,GAAGH,eAAe,CAACG,WAAW,CAAC;EAC1C,IAAIG,OAAO,GAAGD,qBAAqB,GAC/BA,qBAAqB,CAACH,IAAI,EAAEC,WAAW,CAAC,GACxCL,8BAA8B,CAACI,IAAI,EAAEC,WAAW,EAAEC,MAAM,CAAC;EAC7DE,OAAO,GAAGL,GAAG,CAACK,OAAO,EAAE,UAASC,OAAO,EAAE;IACvC,OAAOb,WAAW,CAACC,iBAAiB,CAACY,OAAO,CAAC,CAAC;EAChD,CAAC,CAAC;;EAEF;EACA,MAAMC,SAAS,GAAGP,GAAG,CAACK,OAAO,EAAE,UAASC,OAAO,EAAE;IAC/C,OAAOL,IAAI,CAACO,OAAO,CAACF,OAAO,CAAC;EAC9B,CAAC,CAAC;EAEF,OAAO;IACLG,KAAK,EAAEJ,OAAO,CAACK,MAAM;IACrBL,OAAO,EAAEA,OAAO;IAChBM,QAAQ,EAAEJ,SAAS,CAACG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC,GAAGE,IAAI,CAACC,GAAG,CAAC,GAAGN,SAAS;EAC/D,CAAC;AACH","ignoreList":[]}
@@ -1,8 +1,8 @@
1
1
  /** @module stringProcessing/countWords */
2
2
  import sanitizeString from "../sanitize/sanitizeString";
3
- import { filter, flatMap } from "lodash";
3
+ import { filter, flatMap } from 'lodash';
4
4
  import removePunctuation, { punctuationRegexString } from "../sanitize/removePunctuation.js";
5
- const punctuationRegex = new RegExp(`([${punctuationRegexString}])`, "g");
5
+ const punctuationRegex = new RegExp(`([${punctuationRegexString}])`, 'g');
6
6
 
7
7
  /**
8
8
  * Returns an array with words used in the text.
@@ -13,25 +13,33 @@ const punctuationRegex = new RegExp(`([${punctuationRegexString}])`, "g");
13
13
  *
14
14
  * @returns {Array} The array with all words.
15
15
  */
16
export default function (text, wordBoundaryRegexString = '\\s', shouldRemovePunctuation = true) {
  // Unify whitespaces and non-breaking spaces, remove table of content and strip the tags and multiple spaces.
  text = sanitizeString(text);

  if (text === '') {
    return [];
  }

  // CJK Unified Ideographs block: used to detect text that needs character-level tokenization.
  const chineseCharRegex = /[\u4e00-\u9fff]/;

  let words = [];

  if (chineseCharRegex.test(text) && !/\s/.test(text)) {
    // Chinese text without spaces: every CJK character counts as a word.
    // Contiguous Latin/digit runs are kept as single words instead of being dropped —
    // the previous implementation discarded all non-CJK characters, losing
    // mixed-script tokens such as brand names or numbers. Punctuation is
    // implicitly dropped here, matching the previous behavior.
    words = text.match(/[\u4e00-\u9fff]|[A-Za-z0-9]+/g) || [];
  } else {
    const wordBoundaryRegex = new RegExp(wordBoundaryRegexString, 'g');

    words = text.split(wordBoundaryRegex);

    if (shouldRemovePunctuation) {
      words = words.map(removePunctuation);
    } else {
      // If punctuation is not removed, punctuation marks are tokenized as if they were words.
      words = flatMap(words, word => {
        const newWord = word.replace(punctuationRegex, ' $1 ');
        return newWord.split(' ');
      });
    }
  }

  return filter(words, function (word) {
    return word.trim() !== '';
  });
}
37
45
  //# sourceMappingURL=getWords.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"getWords.js","names":["sanitizeString","filter","flatMap","removePunctuation","punctuationRegexString","punctuationRegex","RegExp","text","wordBoundaryRegexString","shouldRemovePunctuation","wordBoundaryRegex","words","split","map","word","newWord","replace","trim"],"sources":["../../../../src/languageProcessing/helpers/word/getWords.js"],"sourcesContent":["/** @module stringProcessing/countWords */\nimport sanitizeString from \"../sanitize/sanitizeString\";\nimport { filter, flatMap } from \"lodash\";\nimport removePunctuation, { punctuationRegexString } from \"../sanitize/removePunctuation.js\";\n\nconst punctuationRegex = new RegExp( `([${punctuationRegexString}])`, \"g\" );\n\n/**\n * Returns an array with words used in the text.\n *\n * @param {string} text The text to be counted.\n * @param {string} [wordBoundaryRegexString=\\\\s] The regex string for the word boundary that should be used to split the text into words.\n * @param {boolean} [shouldRemovePunctuation=true] If punctuation should be removed. 
Defaults to `true`.\n *\n * @returns {Array} The array with all words.\n */\nexport default function( text, wordBoundaryRegexString = \"\\\\s\", shouldRemovePunctuation = true ) {\n\t// Unify whitespaces and non-breaking spaces, remove table of content and strip the tags and multiple spaces.\n\ttext = sanitizeString( text );\n\n\tif ( text === \"\" ) {\n\t\treturn [];\n\t}\n\n\tconst wordBoundaryRegex = new RegExp( wordBoundaryRegexString, \"g\" );\n\n\tlet words = text.split( wordBoundaryRegex );\n\n\tif ( shouldRemovePunctuation ) {\n\t\twords = words.map( removePunctuation );\n\t} else {\n\t\t// If punctuation is not removed, punctuation marks are tokenized as if they were words.\n\t\twords = flatMap( words, ( word ) => {\n\t\t\tconst newWord = word.replace( punctuationRegex, \" $1 \" );\n\t\t\treturn newWord.split( \" \" );\n\t\t} );\n\t}\n\n\treturn filter( words, function( word ) {\n\t\treturn word.trim() !== \"\";\n\t} );\n}\n\n"],"mappings":"AAAA;AACA,OAAOA,cAAc;AACrB,SAASC,MAAM,EAAEC,OAAO,QAAQ,QAAQ;AACxC,OAAOC,iBAAiB,IAAIC,sBAAsB;AAElD,MAAMC,gBAAgB,GAAG,IAAIC,MAAM,CAAE,KAAKF,sBAAsB,IAAI,EAAE,GAAI,CAAC;;AAE3E;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,UAAUG,IAAI,EAAEC,uBAAuB,GAAG,KAAK,EAAEC,uBAAuB,GAAG,IAAI,EAAG;EAChG;EACAF,IAAI,GAAGP,cAAc,CAAEO,IAAK,CAAC;EAE7B,IAAKA,IAAI,KAAK,EAAE,EAAG;IAClB,OAAO,EAAE;EACV;EAEA,MAAMG,iBAAiB,GAAG,IAAIJ,MAAM,CAAEE,uBAAuB,EAAE,GAAI,CAAC;EAEpE,IAAIG,KAAK,GAAGJ,IAAI,CAACK,KAAK,CAAEF,iBAAkB,CAAC;EAE3C,IAAKD,uBAAuB,EAAG;IAC9BE,KAAK,GAAGA,KAAK,CAACE,GAAG,CAAEV,iBAAkB,CAAC;EACvC,CAAC,MAAM;IACN;IACAQ,KAAK,GAAGT,OAAO,CAAES,KAAK,EAAIG,IAAI,IAAM;MACnC,MAAMC,OAAO,GAAGD,IAAI,CAACE,OAAO,CAAEX,gBAAgB,EAAE,MAAO,CAAC;MACxD,OAAOU,OAAO,CAACH,KAAK,CAAE,GAAI,CAAC;IAC5B,CAAE,CAAC;EACJ;EAEA,OAAOX,MAAM,CAAEU,KAAK,EAAE,UAAUG,IAAI,EAAG;IACtC,OAAOA,IAAI,CAACG,IAAI,CAAC,CAAC,KAAK,EAAE;EAC1B,CAAE,CAAC;AACJ","ignoreList":[]}
1
+ {"version":3,"file":"getWords.js","names":["sanitizeString","filter","flatMap","removePunctuation","punctuationRegexString","punctuationRegex","RegExp","text","wordBoundaryRegexString","shouldRemovePunctuation","chineseCharRegex","hasChinese","test","hasNoSpaces","words","Array","from","char","wordBoundaryRegex","split","map","word","newWord","replace","trim"],"sources":["../../../../src/languageProcessing/helpers/word/getWords.js"],"sourcesContent":["/** @module stringProcessing/countWords */\nimport sanitizeString from '../sanitize/sanitizeString';\nimport {filter, flatMap} from 'lodash';\nimport removePunctuation, {punctuationRegexString} from '../sanitize/removePunctuation.js';\n\nconst punctuationRegex = new RegExp(`([${punctuationRegexString}])`, 'g');\n\n/**\n * Returns an array with words used in the text.\n *\n * @param {string} text The text to be counted.\n * @param {string} [wordBoundaryRegexString=\\\\s] The regex string for the word boundary that should be used to split the text into words.\n * @param {boolean} [shouldRemovePunctuation=true] If punctuation should be removed. 
Defaults to `true`.\n *\n * @returns {Array} The array with all words.\n */\nexport default function(text, wordBoundaryRegexString = '\\\\s', shouldRemovePunctuation = true) {\n // Unify whitespaces and non-breaking spaces, remove table of content and strip the tags and multiple spaces.\n text = sanitizeString(text);\n\n if (text === '') {\n return [];\n }\n\n const chineseCharRegex = /[\\u4e00-\\u9fff]/;\n const hasChinese = chineseCharRegex.test(text);\n const hasNoSpaces = !/\\s/.test(text);\n\n let words = [];\n\n if (hasChinese && hasNoSpaces) {\n words = Array.from(text).filter(char => chineseCharRegex.test(char));\n } else {\n const wordBoundaryRegex = new RegExp(wordBoundaryRegexString, 'g');\n\n words = text.split(wordBoundaryRegex);\n\n if (shouldRemovePunctuation) {\n words = words.map(removePunctuation);\n } else {\n // If punctuation is not removed, punctuation marks are tokenized as if they were words.\n words = flatMap(words, word => {\n const newWord = word.replace(punctuationRegex, ' $1 ');\n return newWord.split(' ');\n });\n }\n }\n\n return filter(words, function(word) {\n return word.trim() !== '';\n 
});\n}\n"],"mappings":"AAAA;AACA,OAAOA,cAAc;AACrB,SAAQC,MAAM,EAAEC,OAAO,QAAO,QAAQ;AACtC,OAAOC,iBAAiB,IAAGC,sBAAsB;AAEjD,MAAMC,gBAAgB,GAAG,IAAIC,MAAM,CAAC,KAAKF,sBAAsB,IAAI,EAAE,GAAG,CAAC;;AAEzE;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,UAASG,IAAI,EAAEC,uBAAuB,GAAG,KAAK,EAAEC,uBAAuB,GAAG,IAAI,EAAE;EAC7F;EACAF,IAAI,GAAGP,cAAc,CAACO,IAAI,CAAC;EAE3B,IAAIA,IAAI,KAAK,EAAE,EAAE;IACf,OAAO,EAAE;EACX;EAEA,MAAMG,gBAAgB,GAAG,iBAAiB;EAC1C,MAAMC,UAAU,GAAGD,gBAAgB,CAACE,IAAI,CAACL,IAAI,CAAC;EAC9C,MAAMM,WAAW,GAAG,CAAC,IAAI,CAACD,IAAI,CAACL,IAAI,CAAC;EAEpC,IAAIO,KAAK,GAAG,EAAE;EAEd,IAAIH,UAAU,IAAIE,WAAW,EAAE;IAC7BC,KAAK,GAAGC,KAAK,CAACC,IAAI,CAACT,IAAI,CAAC,CAACN,MAAM,CAACgB,IAAI,IAAIP,gBAAgB,CAACE,IAAI,CAACK,IAAI,CAAC,CAAC;EACtE,CAAC,MAAM;IACL,MAAMC,iBAAiB,GAAG,IAAIZ,MAAM,CAACE,uBAAuB,EAAE,GAAG,CAAC;IAElEM,KAAK,GAAGP,IAAI,CAACY,KAAK,CAACD,iBAAiB,CAAC;IAErC,IAAIT,uBAAuB,EAAE;MAC3BK,KAAK,GAAGA,KAAK,CAACM,GAAG,CAACjB,iBAAiB,CAAC;IACtC,CAAC,MAAM;MACL;MACAW,KAAK,GAAGZ,OAAO,CAACY,KAAK,EAAEO,IAAI,IAAI;QAC7B,MAAMC,OAAO,GAAGD,IAAI,CAACE,OAAO,CAAClB,gBAAgB,EAAE,MAAM,CAAC;QACtD,OAAOiB,OAAO,CAACH,KAAK,CAAC,GAAG,CAAC;MAC3B,CAAC,CAAC;IACJ;EACF;EAEA,OAAOlB,MAAM,CAACa,KAAK,EAAE,UAASO,IAAI,EAAE;IAClC,OAAOA,IAAI,CAACG,IAAI,CAAC,CAAC,KAAK,EAAE;EAC3B,CAAC,CAAC;AACJ","ignoreList":[]}
@@ -0,0 +1,41 @@
1
import { languageProcessing } from "../../../index.js";

// All helpers
import matchWordCustomHelper from "./helpers/matchTextWithWord";
import splitIntoTokensCustom from "./helpers/splitIntoTokensCustom";
import getSentencesCustom from "./helpers/getSentences";

// All config
import { all as functionWords } from "./config/functionWords";

const { AbstractResearcher } = languageProcessing;

/**
 * The researcher contains all the researches, helpers, data, and config for Chinese language.
 */
export default class Researcher extends AbstractResearcher {
  /**
   * Constructor
   * @param {Paper} paper The Paper object that is needed within the researches.
   * @constructor
   */
  constructor(paper) {
    super(paper);

    // The Flesch reading ease research does not apply to Chinese, so drop it.
    delete this.defaultResearches.getFleschReadingScore;

    // Register the Chinese-specific helpers.
    Object.assign(this.helpers, {
      matchWordCustomHelper,
      splitIntoTokensCustom,
      getSentencesCustom
    });

    // Register the Chinese-specific configuration.
    Object.assign(this.config, {
      language: "zh",
      functionWords,
      // Chinese doesn't use hyphens as word boundaries
      areHyphensWordBoundaries: false
    });
  }
}
41
+ //# sourceMappingURL=Researcher.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"Researcher.js","names":["languageProcessing","AbstractResearcher","matchWordCustomHelper","splitIntoTokensCustom","getSentencesCustom","all","functionWords","Researcher","constructor","paper","defaultResearches","getFleschReadingScore","Object","assign","config","language","areHyphensWordBoundaries","helpers"],"sources":["../../../../src/languageProcessing/languages/zh/Researcher.js"],"sourcesContent":["import {languageProcessing} from '@axyseo/index.js';\nconst {AbstractResearcher} = languageProcessing;\n\n// All helpers\nimport matchWordCustomHelper from './helpers/matchTextWithWord';\nimport splitIntoTokensCustom from './helpers/splitIntoTokensCustom';\nimport getSentencesCustom from './helpers/getSentences';\n\n// All config\nimport {all as functionWords} from './config/functionWords';\n\n/**\n * The researcher contains all the researches, helpers, data, and config for Chinese language.\n */\nexport default class Researcher extends AbstractResearcher {\n /**\n * Constructor\n * @param {Paper} paper The Paper object that is needed within the researches.\n * @constructor\n */\n constructor(paper) {\n super(paper);\n\n // Delete researches that are not suitable for Chinese\n delete this.defaultResearches.getFleschReadingScore;\n\n Object.assign(this.config, {\n language: 'zh',\n functionWords,\n // Chinese doesn't use hyphens as word boundaries\n areHyphensWordBoundaries: false\n });\n\n Object.assign(this.helpers, {\n matchWordCustomHelper,\n splitIntoTokensCustom,\n getSentencesCustom\n });\n 
}\n}\n"],"mappings":"AAAA,SAAQA,kBAAkB;AAC1B,MAAM;EAACC;AAAkB,CAAC,GAAGD,kBAAkB;;AAE/C;AACA,OAAOE,qBAAqB;AAC5B,OAAOC,qBAAqB;AAC5B,OAAOC,kBAAkB;;AAEzB;AACA,SAAQC,GAAG,IAAIC,aAAa;;AAE5B;AACA;AACA;AACA,eAAe,MAAMC,UAAU,SAASN,kBAAkB,CAAC;EACzD;AACF;AACA;AACA;AACA;EACEO,WAAWA,CAACC,KAAK,EAAE;IACjB,KAAK,CAACA,KAAK,CAAC;;IAEZ;IACA,OAAO,IAAI,CAACC,iBAAiB,CAACC,qBAAqB;IAEnDC,MAAM,CAACC,MAAM,CAAC,IAAI,CAACC,MAAM,EAAE;MACzBC,QAAQ,EAAE,IAAI;MACdT,aAAa;MACb;MACAU,wBAAwB,EAAE;IAC5B,CAAC,CAAC;IAEFJ,MAAM,CAACC,MAAM,CAAC,IAAI,CAACI,OAAO,EAAE;MAC1Bf,qBAAqB;MACrBC,qBAAqB;MACrBC;IACF,CAAC,CAAC;EACJ;AACF","ignoreList":[]}
@@ -0,0 +1,40 @@
1
/**
 * Chinese function words, grouped by part of speech.
 * The combined, deduplicated list is exported as `all`.
 */

// Articles and determiners
const articles = ['这', '那', '这个', '那个', '这些', '那些', '一个', '一些'];

// Pronouns
const pronouns = [
  // Personal pronouns
  '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '它们',
  // Possessive pronouns
  '我的', '你的', '他的', '她的', '它的', '我们的', '你们的', '他们的', '她们的', '它们的',
  // Demonstrative pronouns
  '这', '那', '这个', '那个', '这些', '那些', '这里', '那里', '这儿', '那儿'];

// Prepositions
const prepositions = ['在', '从', '到', '向', '朝', '往', '由', '被', '把', '对', '为', '给', '跟', '和', '与', '同', '关于', '按照', '根据'];

// Conjunctions
const conjunctions = ['和', '或', '但', '但是', '可是', '然而', '不过', '而且', '并且', '以及', '还有', '或者', '要么', '因为', '所以', '如果', '要是', '虽然', '尽管'];

// Auxiliary verbs and modal verbs
const auxiliaries = ['是', '有', '没', '没有', '能', '可以', '会', '要', '想', '应该', '必须', '可能', '或许', '也许', '将', '将要', '正在', '已经', '刚刚'];

// Quantifiers and numbers
const quantifiers = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '百', '千', '万', '亿', '个', '只', '条', '本', '张', '件', '台', '辆', '所', '座', '栋', '间', '层', '些', '点', '种', '样', '次', '遍', '回'];

// Adverbs
const adverbs = ['很', '非常', '特别', '十分', '极', '太', '更', '最', '还', '也', '都', '只', '就', '才', '又', '再', '还是', '总是', '经常', '有时', '偶尔', '从来', '永远'];

// Particles
const particles = ['的', '地', '得', '了', '着', '过', '呢', '吗', '吧', '啊', '呀', '哦', '嗯', '哪', '什么', '怎么', '为什么', '哪里', '哪儿', '怎样'];

// Interjections
const interjections = ['哦', '啊', '呀', '哇', '嗯', '嘿', '喂', '哎', '唉', '咦', '哟', '呵', '嘻', '哈'];

/**
 * All Chinese function words, deduplicated.
 * Several words belong to more than one category (e.g. '这' is both an article
 * and a demonstrative pronoun, '和' is both a preposition and a conjunction),
 * so a flat concat would list them twice and skew any counting done by consumers.
 *
 * @type {string[]}
 */
export const all = [...new Set([].concat(articles, pronouns, prepositions, conjunctions, auxiliaries, quantifiers, adverbs, particles, interjections))];
40
+ //# sourceMappingURL=functionWords.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"functionWords.js","names":["articles","pronouns","prepositions","conjunctions","auxiliaries","quantifiers","adverbs","particles","interjections","all","concat"],"sources":["../../../../../src/languageProcessing/languages/zh/config/functionWords.js"],"sourcesContent":["/**\n * Returns an object with Chinese function words.\n *\n * @returns {Array} The array of Chinese function words.\n */\n\n// Articles and determiners\nconst articles = ['这', '那', '这个', '那个', '这些', '那些', '一个', '一些'];\n\n// Pronouns\nconst pronouns = [\n // Personal pronouns\n '我',\n '你',\n '他',\n '她',\n '它',\n '我们',\n '你们',\n '他们',\n '她们',\n '它们',\n // Possessive pronouns\n '我的',\n '你的',\n '他的',\n '她的',\n '它的',\n '我们的',\n '你们的',\n '他们的',\n '她们的',\n '它们的',\n // Demonstrative pronouns\n '这',\n '那',\n '这个',\n '那个',\n '这些',\n '那些',\n '这里',\n '那里',\n '这儿',\n '那儿'\n];\n\n// Prepositions\nconst prepositions = [\n '在',\n '从',\n '到',\n '向',\n '朝',\n '往',\n '由',\n '被',\n '把',\n '对',\n '为',\n '给',\n '跟',\n '和',\n '与',\n '同',\n '关于',\n '按照',\n '根据'\n];\n\n// Conjunctions\nconst conjunctions = [\n '和',\n '或',\n '但',\n '但是',\n '可是',\n '然而',\n '不过',\n '而且',\n '并且',\n '以及',\n '还有',\n '或者',\n '要么',\n '因为',\n '所以',\n '如果',\n '要是',\n '虽然',\n '尽管'\n];\n\n// Auxiliary verbs and modal verbs\nconst auxiliaries = [\n '是',\n '有',\n '没',\n '没有',\n '能',\n '可以',\n '会',\n '要',\n '想',\n '应该',\n '必须',\n '可能',\n '或许',\n '也许',\n '将',\n '将要',\n '正在',\n '已经',\n '刚刚'\n];\n\n// Quantifiers and numbers\nconst quantifiers = [\n '一',\n '二',\n '三',\n '四',\n '五',\n '六',\n '七',\n '八',\n '九',\n '十',\n '百',\n '千',\n '万',\n '亿',\n '个',\n '只',\n '条',\n '本',\n '张',\n '件',\n '台',\n '辆',\n '所',\n '座',\n '栋',\n '间',\n '层',\n '些',\n '点',\n '种',\n '样',\n '次',\n '遍',\n '回'\n];\n\n// Adverbs\nconst adverbs = [\n '很',\n '非常',\n '特别',\n '十分',\n '极',\n '太',\n '更',\n '最',\n '还',\n '也',\n '都',\n '只',\n '就',\n '才',\n '又',\n '再',\n '还是',\n '总是',\n '经常',\n '有时',\n '偶尔',\n '从来',\n '永远'\n];\n\n// Particles\nconst particles = [\n '的',\n 
'地',\n '得',\n '了',\n '着',\n '过',\n '呢',\n '吗',\n '吧',\n '啊',\n '呀',\n '哦',\n '嗯',\n '哪',\n '什么',\n '怎么',\n '为什么',\n '哪里',\n '哪儿',\n '怎样'\n];\n\n// Interjections\nconst interjections = [\n '哦',\n '啊',\n '呀',\n '哇',\n '嗯',\n '嘿',\n '喂',\n '哎',\n '唉',\n '咦',\n '哟',\n '呵',\n '嘻',\n '哈'\n];\n\nexport const all = [].concat(\n articles,\n pronouns,\n prepositions,\n conjunctions,\n auxiliaries,\n quantifiers,\n adverbs,\n particles,\n interjections\n);\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;;AAEA;AACA,MAAMA,QAAQ,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;;AAE/D;AACA,MAAMC,QAAQ,GAAG;AACf;AACA,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI;AACJ;AACA,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,KAAK,EACL,KAAK,EACL,KAAK,EACL,KAAK,EACL,KAAK;AACL;AACA,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,YAAY,GAAG,CACnB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,YAAY,GAAG,CACnB,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,WAAW,GAAG,CAClB,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,GAAG,EACH,IAAI,EACJ,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,WAAW,GAAG,CAClB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,CACJ;;AAED;A
ACA,MAAMC,OAAO,GAAG,CACd,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,SAAS,GAAG,CAChB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,KAAK,EACL,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,aAAa,GAAG,CACpB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,CACJ;AAED,OAAO,MAAMC,GAAG,GAAG,EAAE,CAACC,MAAM,CAC1BV,QAAQ,EACRC,QAAQ,EACRC,YAAY,EACZC,YAAY,EACZC,WAAW,EACXC,WAAW,EACXC,OAAO,EACPC,SAAS,EACTC,aACF,CAAC","ignoreList":[]}
@@ -0,0 +1,42 @@
1
/**
 * Splits Chinese text into sentences, honoring both fullwidth (Chinese)
 * and ASCII sentence-ending punctuation.
 *
 * @param {string} text The text to get sentences from.
 *
 * @returns {string[]} An array of sentences.
 */
export default function getSentences(text) {
  if (!text) {
    return [];
  }

  // Fullwidth (Chinese) and ASCII sentence-ending punctuation marks.
  const chineseSentenceEnders = /[。!?;]/;
  const englishSentenceEnders = /[.!?;]/;

  // Split on either set of enders; the capturing group keeps the punctuation
  // as its own part so it can be re-attached to the preceding content.
  const parts = text.split(/([。!?;.!?;])/).filter(part => part.trim() !== '');

  const sentences = [];
  let buffer = '';

  for (const part of parts) {
    buffer += part;

    const isEnder = chineseSentenceEnders.test(part) || englishSentenceEnders.test(part);
    if (isEnder) {
      // Punctuation closes the sentence accumulated so far.
      const finished = buffer.trim();
      if (finished) {
        sentences.push(finished);
      }
      buffer = '';
    }
  }

  // Trailing content without closing punctuation still counts as a sentence.
  const tail = buffer.trim();
  if (tail) {
    sentences.push(tail);
  }

  return sentences.filter(sentence => sentence.length > 0);
}
42
+ //# sourceMappingURL=getSentences.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"getSentences.js","names":["getSentences","text","chineseSentenceEnders","englishSentenceEnders","sentences","split","filter","sentence","trim","result","currentSentence","i","length","part","test","push"],"sources":["../../../../../src/languageProcessing/languages/zh/helpers/getSentences.js"],"sourcesContent":["/**\n * Gets sentences from Chinese text using Chinese punctuation marks.\n *\n * @param {string} text The text to get sentences from.\n *\n * @returns {string[]} An array of sentences.\n */\nexport default function getSentences(text) {\n if (!text) {\n return [];\n }\n\n // Chinese sentence-ending punctuation marks\n const chineseSentenceEnders = /[。!?;]/;\n const englishSentenceEnders = /[.!?;]/;\n\n // Split by Chinese and English sentence enders, keeping the punctuation\n const sentences = text.split(/([。!?;.!?;])/).filter(sentence => sentence.trim() !== '');\n\n const result = [];\n let currentSentence = '';\n\n for (let i = 0; i < sentences.length; i++) {\n const part = sentences[i];\n\n if (chineseSentenceEnders.test(part) || englishSentenceEnders.test(part)) {\n // This is punctuation - add to current sentence and finalize it\n currentSentence += part;\n if (currentSentence.trim()) {\n result.push(currentSentence.trim());\n }\n currentSentence = '';\n } else {\n // This is sentence content\n currentSentence += part;\n }\n }\n\n // Add any remaining sentence\n if (currentSentence.trim()) {\n result.push(currentSentence.trim());\n }\n\n return result.filter(sentence => sentence.length > 
0);\n}\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASA,YAAYA,CAACC,IAAI,EAAE;EACzC,IAAI,CAACA,IAAI,EAAE;IACT,OAAO,EAAE;EACX;;EAEA;EACA,MAAMC,qBAAqB,GAAG,QAAQ;EACtC,MAAMC,qBAAqB,GAAG,QAAQ;;EAEtC;EACA,MAAMC,SAAS,GAAGH,IAAI,CAACI,KAAK,CAAC,cAAc,CAAC,CAACC,MAAM,CAACC,QAAQ,IAAIA,QAAQ,CAACC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;EAEvF,MAAMC,MAAM,GAAG,EAAE;EACjB,IAAIC,eAAe,GAAG,EAAE;EAExB,KAAK,IAAIC,CAAC,GAAG,CAAC,EAAEA,CAAC,GAAGP,SAAS,CAACQ,MAAM,EAAED,CAAC,EAAE,EAAE;IACzC,MAAME,IAAI,GAAGT,SAAS,CAACO,CAAC,CAAC;IAEzB,IAAIT,qBAAqB,CAACY,IAAI,CAACD,IAAI,CAAC,IAAIV,qBAAqB,CAACW,IAAI,CAACD,IAAI,CAAC,EAAE;MACxE;MACAH,eAAe,IAAIG,IAAI;MACvB,IAAIH,eAAe,CAACF,IAAI,CAAC,CAAC,EAAE;QAC1BC,MAAM,CAACM,IAAI,CAACL,eAAe,CAACF,IAAI,CAAC,CAAC,CAAC;MACrC;MACAE,eAAe,GAAG,EAAE;IACtB,CAAC,MAAM;MACL;MACAA,eAAe,IAAIG,IAAI;IACzB;EACF;;EAEA;EACA,IAAIH,eAAe,CAACF,IAAI,CAAC,CAAC,EAAE;IAC1BC,MAAM,CAACM,IAAI,CAACL,eAAe,CAACF,IAAI,CAAC,CAAC,CAAC;EACrC;EAEA,OAAOC,MAAM,CAACH,MAAM,CAACC,QAAQ,IAAIA,QAAQ,CAACK,MAAM,GAAG,CAAC,CAAC;AACvD","ignoreList":[]}
@@ -0,0 +1,35 @@
1
/**
 * Matches a word in a text for Chinese language.
 * Chinese doesn't use spaces between words, so we use exact string matching.
 * This function is compatible with the matchWordCustomHelper interface.
 *
 * @param {string|Object} sentence The sentence to search in (can be string or sentence object).
 * @param {string} word The word to search for.
 *
 * @returns {Array} An array of matches found in the text.
 */
export default function matchTextWithWord(sentence, word) {
  const matches = [];

  // Handle both string and sentence object.
  // NOTE: the previous version fell through to the raw object when `.text`
  // was missing (`sentence.text || sentence`) and then crashed on
  // `toLowerCase()`; a null sentence also threw. Guard for real strings.
  const text = typeof sentence === 'string' ? sentence : sentence && sentence.text;

  if (typeof text !== 'string' || text === '' || typeof word !== 'string' || word === '') {
    return matches;
  }

  // Convert both text and word to lowercase for case-insensitive matching.
  const lowerText = text.toLowerCase();
  const lowerWord = word.toLowerCase();

  let startIndex = 0;
  let index;

  // Find all non-overlapping occurrences of the word in the text.
  while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {
    // For Chinese, we add the matched word directly as a string.
    // This matches the expected return format for the custom helper.
    matches.push(word);
    startIndex = index + lowerWord.length;
  }

  return matches;
}
35
+ //# sourceMappingURL=matchTextWithWord.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"matchTextWithWord.js","names":["matchTextWithWord","sentence","word","matches","text","lowerText","toLowerCase","lowerWord","startIndex","index","indexOf","push","length"],"sources":["../../../../../src/languageProcessing/languages/zh/helpers/matchTextWithWord.js"],"sourcesContent":["/**\n * Matches a word in a text for Chinese language.\n * Chinese doesn't use spaces between words, so we use exact string matching.\n * This function is compatible with the matchWordCustomHelper interface.\n *\n * @param {string|Object} sentence The sentence to search in (can be string or sentence object).\n * @param {string} word The word to search for.\n *\n * @returns {Array} An array of matches found in the text.\n */\nexport default function matchTextWithWord(sentence, word) {\n const matches = [];\n\n // Handle both string and sentence object\n const text = typeof sentence === 'string' ? sentence : sentence.text || sentence;\n\n if (!text || !word) {\n return matches;\n }\n\n // Convert both text and word to lowercase for case-insensitive matching\n const lowerText = text.toLowerCase();\n const lowerWord = word.toLowerCase();\n\n let startIndex = 0;\n let index;\n\n // Find all occurrences of the word in the text\n while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {\n // For Chinese, we add the matched word directly as a string\n // This matches the expected return format for the custom helper\n matches.push(word);\n startIndex = index + lowerWord.length;\n }\n\n return 
matches;\n}\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASA,iBAAiBA,CAACC,QAAQ,EAAEC,IAAI,EAAE;EACxD,MAAMC,OAAO,GAAG,EAAE;;EAElB;EACA,MAAMC,IAAI,GAAG,OAAOH,QAAQ,KAAK,QAAQ,GAAGA,QAAQ,GAAGA,QAAQ,CAACG,IAAI,IAAIH,QAAQ;EAEhF,IAAI,CAACG,IAAI,IAAI,CAACF,IAAI,EAAE;IAClB,OAAOC,OAAO;EAChB;;EAEA;EACA,MAAME,SAAS,GAAGD,IAAI,CAACE,WAAW,CAAC,CAAC;EACpC,MAAMC,SAAS,GAAGL,IAAI,CAACI,WAAW,CAAC,CAAC;EAEpC,IAAIE,UAAU,GAAG,CAAC;EAClB,IAAIC,KAAK;;EAET;EACA,OAAO,CAACA,KAAK,GAAGJ,SAAS,CAACK,OAAO,CAACH,SAAS,EAAEC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE;IAChE;IACA;IACAL,OAAO,CAACQ,IAAI,CAACT,IAAI,CAAC;IAClBM,UAAU,GAAGC,KAAK,GAAGF,SAAS,CAACK,MAAM;EACvC;EAEA,OAAOT,OAAO;AAChB","ignoreList":[]}