axyseo 2.1.8 → 2.1.10
This diff shows the content of publicly available package versions that were released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- package/build/helpers/getLanguageResearcher.js +71 -0
- package/build/helpers/getLanguageResearcher.js.map +1 -0
- package/build/languageProcessing/helpers/language/chineseHelperFactory.js +162 -0
- package/build/languageProcessing/helpers/language/chineseHelperFactory.js.map +1 -0
- package/build/languageProcessing/helpers/language/isChineseText.js +17 -0
- package/build/languageProcessing/helpers/language/isChineseText.js.map +1 -0
- package/build/languageProcessing/helpers/match/matchTextWithWord.js +1 -1
- package/build/languageProcessing/helpers/match/matchTextWithWord.js.map +1 -1
- package/build/languageProcessing/helpers/word/getWords.js +22 -14
- package/build/languageProcessing/helpers/word/getWords.js.map +1 -1
- package/build/languageProcessing/languages/zh/Researcher.js +41 -0
- package/build/languageProcessing/languages/zh/Researcher.js.map +1 -0
- package/build/languageProcessing/languages/zh/config/functionWords.js +40 -0
- package/build/languageProcessing/languages/zh/config/functionWords.js.map +1 -0
- package/build/languageProcessing/languages/zh/helpers/getSentences.js +42 -0
- package/build/languageProcessing/languages/zh/helpers/getSentences.js.map +1 -0
- package/build/languageProcessing/languages/zh/helpers/matchTextWithWord.js +35 -0
- package/build/languageProcessing/languages/zh/helpers/matchTextWithWord.js.map +1 -0
- package/build/languageProcessing/languages/zh/helpers/splitIntoTokensCustom.js +41 -0
- package/build/languageProcessing/languages/zh/helpers/splitIntoTokensCustom.js.map +1 -0
- package/build/languageProcessing/researches/findKeywordInFirstParagraph.js +23 -1
- package/build/languageProcessing/researches/findKeywordInFirstParagraph.js.map +1 -1
- package/build/languageProcessing/researches/getAnchorsWithKeyphrase.js +22 -17
- package/build/languageProcessing/researches/getAnchorsWithKeyphrase.js.map +1 -1
- package/build/languageProcessing/researches/getParagraphs.js +13 -4
- package/build/languageProcessing/researches/getParagraphs.js.map +1 -1
- package/build/languageProcessing/researches/keywordCount.js +29 -1
- package/build/languageProcessing/researches/keywordCount.js.map +1 -1
- package/build/languageProcessing/researches/keywordCountInUrl.js +150 -5
- package/build/languageProcessing/researches/keywordCountInUrl.js.map +1 -1
- package/build/languageProcessing/researches/metaDescriptionKeyword.js +16 -4
- package/build/languageProcessing/researches/metaDescriptionKeyword.js.map +1 -1
- package/build/scoring/assessments/seo/IntroductionKeywordAssessment.js +5 -1
- package/build/scoring/assessments/seo/IntroductionKeywordAssessment.js.map +1 -1
- package/build/scoring/assessments/seo/KeywordDensityAssessment.js.map +1 -1
- package/build/scoring/assessments/seo/UrlKeywordAssessment.js +5 -1
- package/build/scoring/assessments/seo/UrlKeywordAssessment.js.map +1 -1
- package/package.json +1 -1
@@ -0,0 +1,71 @@
+import { languageProcessing } from "../index.js";
+const {
+  AbstractResearcher
+} = languageProcessing;
+
+// Import language-specific researchers
+import EnglishResearcher from "../languageProcessing/languages/en/Researcher.js";
+import ChineseResearcher from "../languageProcessing/languages/zh/Researcher.js";
+import JapaneseResearcher from "../languageProcessing/languages/ja/Researcher.js";
+import DefaultResearcher from "../languageProcessing/languages/_default/Researcher.js";
+import isChineseText from "../languageProcessing/helpers/language/isChineseText";
+
+/**
+ * Detects the language of the given text and returns the appropriate researcher.
+ *
+ * @param {Paper} paper The paper object containing the text to analyze.
+ * @param {string} [locale] Optional locale override.
+ *
+ * @returns {AbstractResearcher} The appropriate researcher for the detected language.
+ */
+export default function getLanguageResearcher(paper, locale = null) {
+  // Use provided locale or try to get from paper
+  const paperLocale = locale || (paper && paper.getLocale ? paper.getLocale() : '');
+
+  // Get text from paper for language detection
+  let text = '';
+  if (paper) {
+    const keyword = paper.getKeyword ? paper.getKeyword() : '';
+    const title = paper.getTitle ? paper.getTitle() : '';
+    const content = paper.getText ? paper.getText() : '';
+    text = [keyword, title, content].join(' ');
+  }
+
+  // Explicit locale mapping
+  if (paperLocale) {
+    const languageCode = paperLocale.split('-')[0].toLowerCase();
+    switch (languageCode) {
+      case 'zh':
+      case 'zh-cn':
+      case 'zh-tw':
+        return new ChineseResearcher(paper);
+      case 'ja':
+        return new JapaneseResearcher(paper);
+      case 'en':
+        return new EnglishResearcher(paper);
+      default:
+        // Continue to text-based detection
+        break;
+    }
+  }
+
+  // Text-based language detection
+  if (isChineseText(text)) {
+    return new ChineseResearcher(paper);
+  }
+
+  // Check for Japanese characters
+  const japaneseRegex = /[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9faf]/;
+  if (japaneseRegex.test(text)) {
+    return new JapaneseResearcher(paper);
+  }
+
+  // Default to English or provided locale
+  if (paperLocale === 'en' || !paperLocale) {
+    return new EnglishResearcher(paper);
+  }
+
+  // Fallback to default researcher
+  return new DefaultResearcher(paper);
+}
+//# sourceMappingURL=getLanguageResearcher.js.map
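Two observations on this detector: languageCode is the locale segment before the first '-', lower-cased, so the 'zh-cn' and 'zh-tw' cases can never be reached on their own ('zh' already catches them); and the text-based fallback runs the Chinese check before the Japanese one, so unlocalized Japanese text containing kanji is routed to the Chinese researcher, leaving the kana branch for pure-kana input. A minimal usage sketch (not from the package; the paper shape is inferred from the guarded getter calls above, and the deep import path simply mirrors the build layout):

// Sketch: any object exposing these getters works, since every access is guarded.
import getLanguageResearcher from "axyseo/build/helpers/getLanguageResearcher.js";

const paper = {
  getKeyword: () => "跨境物流",
  getTitle: () => "跨境物流指南",
  getText: () => "这是一篇关于跨境物流的文章。",
  getLocale: () => ""              // empty locale forces text-based detection
};
const researcher = getLanguageResearcher(paper); // → a ChineseResearcher instance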
@@ -0,0 +1 @@
{"version":3,"file":"getLanguageResearcher.js","names":["languageProcessing","AbstractResearcher","EnglishResearcher","ChineseResearcher","JapaneseResearcher","DefaultResearcher","isChineseText","getLanguageResearcher","paper","locale","paperLocale","getLocale","text","keyword","getKeyword","title","getTitle","content","getText","join","languageCode","split","toLowerCase","japaneseRegex","test"],"sources":["../../src/helpers/getLanguageResearcher.js"],"sourcesContent":["import {languageProcessing} from '@axyseo/index.js';\nconst {AbstractResearcher} = languageProcessing;\n\n// Import language-specific researchers\nimport EnglishResearcher from '../languageProcessing/languages/en/Researcher.js';\nimport ChineseResearcher from '../languageProcessing/languages/zh/Researcher.js';\nimport JapaneseResearcher from '../languageProcessing/languages/ja/Researcher.js';\nimport DefaultResearcher from '../languageProcessing/languages/_default/Researcher.js';\n\nimport isChineseText from '../languageProcessing/helpers/language/isChineseText';\n\n/**\n * Detects the language of the given text and returns the appropriate researcher.\n *\n * @param {Paper} paper The paper object containing the text to analyze.\n * @param {string} [locale] Optional locale override.\n *\n * @returns {AbstractResearcher} The appropriate researcher for the detected language.\n */\nexport default function getLanguageResearcher(paper, locale = null) {\n // Use provided locale or try to get from paper\n const paperLocale = locale || (paper && paper.getLocale ? paper.getLocale() : '');\n\n // Get text from paper for language detection\n let text = '';\n if (paper) {\n const keyword = paper.getKeyword ? paper.getKeyword() : '';\n const title = paper.getTitle ? paper.getTitle() : '';\n const content = paper.getText ? 
paper.getText() : '';\n text = [keyword, title, content].join(' ');\n }\n\n // Explicit locale mapping\n if (paperLocale) {\n const languageCode = paperLocale.split('-')[0].toLowerCase();\n\n switch (languageCode) {\n case 'zh':\n case 'zh-cn':\n case 'zh-tw':\n return new ChineseResearcher(paper);\n case 'ja':\n return new JapaneseResearcher(paper);\n case 'en':\n return new EnglishResearcher(paper);\n default:\n // Continue to text-based detection\n break;\n }\n }\n\n // Text-based language detection\n if (isChineseText(text)) {\n return new ChineseResearcher(paper);\n }\n\n // Check for Japanese characters\n const japaneseRegex = /[\\u3040-\\u309f\\u30a0-\\u30ff\\u4e00-\\u9faf]/;\n if (japaneseRegex.test(text)) {\n return new JapaneseResearcher(paper);\n }\n\n // Default to English or provided locale\n if (paperLocale === 'en' || !paperLocale) {\n return new EnglishResearcher(paper);\n }\n\n // Fallback to default researcher\n return new DefaultResearcher(paper);\n}\n"],"mappings":"AAAA,SAAQA,kBAAkB;AAC1B,MAAM;EAACC;AAAkB,CAAC,GAAGD,kBAAkB;;AAE/C;AACA,OAAOE,iBAAiB;AACxB,OAAOC,iBAAiB;AACxB,OAAOC,kBAAkB;AACzB,OAAOC,iBAAiB;AAExB,OAAOC,aAAa;;AAEpB;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASC,qBAAqBA,CAACC,KAAK,EAAEC,MAAM,GAAG,IAAI,EAAE;EAClE;EACA,MAAMC,WAAW,GAAGD,MAAM,KAAKD,KAAK,IAAIA,KAAK,CAACG,SAAS,GAAGH,KAAK,CAACG,SAAS,CAAC,CAAC,GAAG,EAAE,CAAC;;EAEjF;EACA,IAAIC,IAAI,GAAG,EAAE;EACb,IAAIJ,KAAK,EAAE;IACT,MAAMK,OAAO,GAAGL,KAAK,CAACM,UAAU,GAAGN,KAAK,CAACM,UAAU,CAAC,CAAC,GAAG,EAAE;IAC1D,MAAMC,KAAK,GAAGP,KAAK,CAACQ,QAAQ,GAAGR,KAAK,CAACQ,QAAQ,CAAC,CAAC,GAAG,EAAE;IACpD,MAAMC,OAAO,GAAGT,KAAK,CAACU,OAAO,GAAGV,KAAK,CAACU,OAAO,CAAC,CAAC,GAAG,EAAE;IACpDN,IAAI,GAAG,CAACC,OAAO,EAAEE,KAAK,EAAEE,OAAO,CAAC,CAACE,IAAI,CAAC,GAAG,CAAC;EAC5C;;EAEA;EACA,IAAIT,WAAW,EAAE;IACf,MAAMU,YAAY,GAAGV,WAAW,CAACW,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAACC,WAAW,CAAC,CAAC;IAE5D,QAAQF,YAAY;MAClB,KAAK,IAAI;MACT,KAAK,OAAO;MACZ,KAAK,OAAO;QACV,OAAO,IAAIjB,iBAAiB,CAACK,KAAK,CAAC;MACrC,KAAK,IAAI;QACP,OAAO,IAAIJ,kBAAkB,CAACI,KAAK,CAAC;MACtC,KAAK,IAAI;QACP,OAAO,IAAIN,iBAAiB,CAACM,KAAK,CAAC;MACrC;QACE;QACA;IACJ;EACF;;EAEA;EACA,IAAIF,aAAa,CAACM,IAAI,CAAC,EAAE;IACvB,OAAO,IAAIT,iBAAiB,CAACK,KAAK,CAAC;EACrC;;EAEA;EACA,MAAMe,aAAa,GAAG,2CAA2C;EACjE,IAAIA,aAAa,CAACC,IAAI,CAACZ,IAAI,CAAC,EAAE;IAC5B,OAAO,IAAIR,kBAAkB,CAACI,KAAK,CAAC;EACtC;;EAEA;EACA,IAAIE,WAAW,KAAK,IAAI,IAAI,CAACA,WAAW,EAAE;IACxC,OAAO,IAAIR,iBAAiB,CAACM,KAAK,CAAC;EACrC;;EAEA;EACA,OAAO,IAAIH,iBAAiB,CAACG,KAAK,CAAC;AACrC","ignoreList":[]}
@@ -0,0 +1,162 @@
+import isChineseText from "./isChineseText.js";
+
+/**
+ * Creates a Chinese helper function for matching words in sentences
+ * This is used to provide consistent Chinese text processing across all functions
+ *
+ * @returns {Function} Chinese helper function for matching words
+ */
+export function createChineseMatchHelper() {
+  return function (sentence, word) {
+    const text = typeof sentence === 'string' ? sentence : sentence.text || sentence;
+    const matches = [];
+    if (!text || !word) {
+      return matches;
+    }
+    const lowerText = text.toLowerCase();
+    const lowerWord = word.toLowerCase();
+    let startIndex = 0;
+    let index;
+
+    // Use exact string matching for Chinese text
+    while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {
+      matches.push(word);
+      startIndex = index + lowerWord.length;
+    }
+    return matches;
+  };
+}
+
+/**
+ * Creates an enhanced Chinese helper function specifically for URL/slug matching
+ * This provides better matching logic for Chinese keywords in URL contexts
+ *
+ * @returns {Function} Enhanced Chinese helper function for URL matching
+ */
+export function createChineseUrlMatchHelper() {
+  return function (slugText, chineseChar) {
+    const text = typeof slugText === 'string' ? slugText : slugText.text || slugText;
+    const matches = [];
+    if (!text || !chineseChar) {
+      return matches;
+    }
+    const lowerText = text.toLowerCase();
+    const lowerChar = chineseChar.toLowerCase();
+
+    // First, try exact character matching (for Chinese characters in slug)
+    let startIndex = 0;
+    let index;
+    while ((index = lowerText.indexOf(lowerChar, startIndex)) !== -1) {
+      matches.push(chineseChar);
+      startIndex = index + lowerChar.length;
+    }
+
+    // If no exact matches found and we have a Chinese character, try fuzzy matching
+    if (matches.length === 0 && isChineseText(chineseChar)) {
+      // For URL context, be more lenient - if the slug contains any related terms,
+      // we'll consider it a partial match to improve user experience
+      const hasRelevantContent = checkForRelevantContent(lowerText, lowerChar);
+      if (hasRelevantContent) {
+        matches.push(chineseChar);
+      }
+    }
+    return matches;
+  };
+}
+
+/**
+ * Checks if the slug contains content that might be related to the Chinese character
+ * This is a fuzzy matching approach for better URL keyword assessment
+ *
+ * @param {string} slugText - The slug text to check
+ * @param {string} chineseChar - The Chinese character to match
+ * @returns {boolean} True if relevant content is found
+ */
+function checkForRelevantContent(slugText, chineseChar) {
+  // For common Chinese business/logistics terms, provide some fuzzy matching
+  const chineseToEnglishMap = {
+    跨: ['cross', 'trans', 'inter', 'kuà', 'kua', 'kuajing'],
+    境: ['border', 'boundary', 'realm', 'jìng', 'jing', 'kuajing'],
+    物: ['goods', 'item', 'thing', 'material', 'wù', 'wu', 'wuliu'],
+    流: ['flow', 'current', 'logistics', 'liú', 'liu', 'wuliu'],
+    货: ['goods', 'cargo', 'freight', 'huò', 'huo'],
+    运: ['transport', 'shipping', 'yùn', 'yun'],
+    输: ['transport', 'transmit', 'shū', 'shu'],
+    送: ['deliver', 'send', 'sòng', 'song'],
+    配: ['distribute', 'match', 'pèi', 'pei'],
+    仓: ['warehouse', 'storage', 'cāng', 'cang'],
+    储: ['store', 'storage', 'chǔ', 'chu']
+  };
+  const possibleMatches = chineseToEnglishMap[chineseChar] || [];
+
+  // Check if slug contains any related terms
+  return possibleMatches.some(term => slugText.includes(term.toLowerCase()));
+}
+
+/**
+ * Gets appropriate match helper based on text content
+ * Auto-detects Chinese text and returns Chinese helper if needed
+ *
+ * @param {string} text - Text to analyze
+ * @param {string} keyword - Keyword to analyze
+ * @param {Function} existingHelper - Existing helper if available
+ * @param {boolean} isUrlContext - Whether this is for URL/slug matching
+ * @returns {Function|boolean} Appropriate helper function or false
+ */
+export function getMatchHelper(text, keyword, existingHelper, isUrlContext = false) {
+  // If there's already a custom helper, use it
+  if (existingHelper) {
+    return existingHelper;
+  }
+
+  // Auto-detect Chinese and provide appropriate Chinese helper
+  if (isChineseText(keyword + ' ' + text)) {
+    return isUrlContext ? createChineseUrlMatchHelper() : createChineseMatchHelper();
+  }
+
+  // Return false for default behavior with other languages
+  return false;
+}
+
+/**
+ * Applies Chinese helper to researcher if Chinese text is detected
+ * This is a utility function for updating researchers dynamically
+ *
+ * @param {Object} researcher - The researcher object to update
+ * @param {string} text - Text to analyze
+ * @param {string} keyword - Keyword to analyze
+ * @param {boolean} isUrlContext - Whether this is for URL/slug matching
+ * @returns {Object} Updated researcher object
+ */
+export function enhanceResearcherForChinese(researcher, text, keyword, isUrlContext = false) {
+  if (!researcher.getHelper('matchWordCustomHelper') && isChineseText(keyword + ' ' + text)) {
+    // Clone the researcher to avoid modifying original
+    const enhancedResearcher = Object.create(researcher);
+    enhancedResearcher.helpers = {
+      ...researcher.helpers
+    };
+    enhancedResearcher.helpers.matchWordCustomHelper = isUrlContext ? createChineseUrlMatchHelper() : createChineseMatchHelper();
+    return enhancedResearcher;
+  }
+  return researcher;
+}
+
+/**
+ * Checks if auto-Chinese enhancement should be applied
+ *
+ * @param {Object} researcher - The researcher object
+ * @param {string} text - Text to analyze
+ * @param {string} keyword - Keyword to analyze
+ * @returns {boolean} True if Chinese enhancement should be applied
+ */
+export function shouldEnhanceForChinese(researcher, text, keyword) {
+  return !researcher.getHelper('matchWordCustomHelper') && isChineseText(keyword + ' ' + text);
+}
+export default {
+  createChineseMatchHelper,
+  createChineseUrlMatchHelper,
+  getMatchHelper,
+  enhanceResearcherForChinese,
+  shouldEnhanceForChinese
+};
+//# sourceMappingURL=chineseHelperFactory.js.map
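Two details worth noting in this factory: getMatchHelper returns false rather than a function for non-Chinese input, so callers must treat the result as optional; and enhanceResearcherForChinese avoids mutating the caller's researcher by layering Object.create over it and copying the helpers bag before swapping in matchWordCustomHelper. A small call sketch (illustrative values only; the deep import path mirrors the build layout):

import { getMatchHelper } from "axyseo/build/languageProcessing/helpers/language/chineseHelperFactory.js";

// No existing helper and Chinese input: a Chinese matcher comes back.
const helper = getMatchHelper("全球物流服务,物流很快", "物流", null);
helper("全球物流服务,物流很快", "物流");          // ["物流", "物流"] – two exact substring hits

// Non-Chinese input: false, i.e. "fall back to the default transliteration matching".
getMatchHelper("logistics services", "logistics", null);  // false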
@@ -0,0 +1 @@
{"version":3,"file":"chineseHelperFactory.js","names":["isChineseText","createChineseMatchHelper","sentence","word","text","matches","lowerText","toLowerCase","lowerWord","startIndex","index","indexOf","push","length","createChineseUrlMatchHelper","slugText","chineseChar","lowerChar","hasRelevantContent","checkForRelevantContent","chineseToEnglishMap","跨","境","物","流","货","运","输","送","配","仓","储","possibleMatches","some","term","includes","getMatchHelper","keyword","existingHelper","isUrlContext","enhanceResearcherForChinese","researcher","getHelper","enhancedResearcher","Object","create","helpers","matchWordCustomHelper","shouldEnhanceForChinese"],"sources":["../../../../src/languageProcessing/helpers/language/chineseHelperFactory.js"],"sourcesContent":["import isChineseText from './isChineseText.js';\n\n/**\n * Creates a Chinese helper function for matching words in sentences\n * This is used to provide consistent Chinese text processing across all functions\n *\n * @returns {Function} Chinese helper function for matching words\n */\nexport function createChineseMatchHelper() {\n return function(sentence, word) {\n const text = typeof sentence === 'string' ? sentence : sentence.text || sentence;\n const matches = [];\n\n if (!text || !word) {\n return matches;\n }\n\n const lowerText = text.toLowerCase();\n const lowerWord = word.toLowerCase();\n\n let startIndex = 0;\n let index;\n\n // Use exact string matching for Chinese text\n while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {\n matches.push(word);\n startIndex = index + lowerWord.length;\n }\n\n return matches;\n };\n}\n\n/**\n * Creates an enhanced Chinese helper function specifically for URL/slug matching\n * This provides better matching logic for Chinese keywords in URL contexts\n *\n * @returns {Function} Enhanced Chinese helper function for URL matching\n */\nexport function createChineseUrlMatchHelper() {\n return function(slugText, chineseChar) {\n const text = typeof slugText === 'string' ? 
slugText : slugText.text || slugText;\n const matches = [];\n\n if (!text || !chineseChar) {\n return matches;\n }\n\n const lowerText = text.toLowerCase();\n const lowerChar = chineseChar.toLowerCase();\n\n // First, try exact character matching (for Chinese characters in slug)\n let startIndex = 0;\n let index;\n while ((index = lowerText.indexOf(lowerChar, startIndex)) !== -1) {\n matches.push(chineseChar);\n startIndex = index + lowerChar.length;\n }\n\n // If no exact matches found and we have a Chinese character, try fuzzy matching\n if (matches.length === 0 && isChineseText(chineseChar)) {\n // For URL context, be more lenient - if the slug contains any related terms,\n // we'll consider it a partial match to improve user experience\n const hasRelevantContent = checkForRelevantContent(lowerText, lowerChar);\n if (hasRelevantContent) {\n matches.push(chineseChar);\n }\n }\n\n return matches;\n };\n}\n\n/**\n * Checks if the slug contains content that might be related to the Chinese character\n * This is a fuzzy matching approach for better URL keyword assessment\n *\n * @param {string} slugText - The slug text to check\n * @param {string} chineseChar - The Chinese character to match\n * @returns {boolean} True if relevant content is found\n */\nfunction checkForRelevantContent(slugText, chineseChar) {\n // For common Chinese business/logistics terms, provide some fuzzy matching\n const chineseToEnglishMap = {\n 跨: ['cross', 'trans', 'inter', 'kuà', 'kua', 'kuajing'],\n 境: ['border', 'boundary', 'realm', 'jìng', 'jing', 'kuajing'],\n 物: ['goods', 'item', 'thing', 'material', 'wù', 'wu', 'wuliu'],\n 流: ['flow', 'current', 'logistics', 'liú', 'liu', 'wuliu'],\n 货: ['goods', 'cargo', 'freight', 'huò', 'huo'],\n 运: ['transport', 'shipping', 'yùn', 'yun'],\n 输: ['transport', 'transmit', 'shū', 'shu'],\n 送: ['deliver', 'send', 'sòng', 'song'],\n 配: ['distribute', 'match', 'pèi', 'pei'],\n 仓: ['warehouse', 'storage', 'cāng', 'cang'],\n 储: ['store', 'storage', 'chǔ', 'chu']\n };\n\n const possibleMatches = chineseToEnglishMap[chineseChar] || [];\n\n // Check if slug contains any related terms\n return possibleMatches.some(term => slugText.includes(term.toLowerCase()));\n}\n\n/**\n * Gets appropriate match helper based on text content\n * Auto-detects Chinese text and returns Chinese helper if needed\n *\n * @param {string} text - Text to analyze\n * @param {string} keyword - Keyword to analyze\n * @param {Function} existingHelper - Existing helper if available\n * @param {boolean} isUrlContext - Whether this is for URL/slug matching\n * @returns {Function|boolean} Appropriate helper function or false\n */\nexport function getMatchHelper(text, keyword, existingHelper, isUrlContext = false) {\n // If there's already a custom helper, use it\n if (existingHelper) {\n return existingHelper;\n }\n\n // Auto-detect Chinese and provide appropriate Chinese helper\n if (isChineseText(keyword + ' ' + text)) {\n return isUrlContext ? 
createChineseUrlMatchHelper() : createChineseMatchHelper();\n }\n\n // Return false for default behavior with other languages\n return false;\n}\n\n/**\n * Applies Chinese helper to researcher if Chinese text is detected\n * This is a utility function for updating researchers dynamically\n *\n * @param {Object} researcher - The researcher object to update\n * @param {string} text - Text to analyze\n * @param {string} keyword - Keyword to analyze\n * @param {boolean} isUrlContext - Whether this is for URL/slug matching\n * @returns {Object} Updated researcher object\n */\nexport function enhanceResearcherForChinese(researcher, text, keyword, isUrlContext = false) {\n if (!researcher.getHelper('matchWordCustomHelper') && isChineseText(keyword + ' ' + text)) {\n // Clone the researcher to avoid modifying original\n const enhancedResearcher = Object.create(researcher);\n enhancedResearcher.helpers = {...researcher.helpers};\n enhancedResearcher.helpers.matchWordCustomHelper = isUrlContext\n ? createChineseUrlMatchHelper()\n : createChineseMatchHelper();\n return enhancedResearcher;\n }\n\n return researcher;\n}\n\n/**\n * Checks if auto-Chinese enhancement should be applied\n *\n * @param {Object} researcher - The researcher object\n * @param {string} text - Text to analyze\n * @param {string} keyword - Keyword to analyze\n * @returns {boolean} True if Chinese enhancement should be applied\n */\nexport function shouldEnhanceForChinese(researcher, text, keyword) {\n return !researcher.getHelper('matchWordCustomHelper') && isChineseText(keyword + ' ' + text);\n}\n\nexport default {\n createChineseMatchHelper,\n createChineseUrlMatchHelper,\n getMatchHelper,\n enhanceResearcherForChinese,\n shouldEnhanceForChinese\n};\n"],"mappings":"AAAA,OAAOA,aAAa;;AAEpB;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASC,wBAAwBA,CAAA,EAAG;EACzC,OAAO,UAASC,QAAQ,EAAEC,IAAI,EAAE;IAC9B,MAAMC,IAAI,GAAG,OAAOF,QAAQ,KAAK,QAAQ,GAAGA,QAAQ,GAAGA,QAAQ,CAACE,IAAI,IAAIF,QAAQ;IAChF,MAAMG,OAAO,GAAG,EAAE;IAElB,IAAI,CAACD,IAAI,IAAI,CAACD,IAAI,EAAE;MAClB,OAAOE,OAAO;IAChB;IAEA,MAAMC,SAAS,GAAGF,IAAI,CAACG,WAAW,CAAC,CAAC;IACpC,MAAMC,SAAS,GAAGL,IAAI,CAACI,WAAW,CAAC,CAAC;IAEpC,IAAIE,UAAU,GAAG,CAAC;IAClB,IAAIC,KAAK;;IAET;IACA,OAAO,CAACA,KAAK,GAAGJ,SAAS,CAACK,OAAO,CAACH,SAAS,EAAEC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE;MAChEJ,OAAO,CAACO,IAAI,CAACT,IAAI,CAAC;MAClBM,UAAU,GAAGC,KAAK,GAAGF,SAAS,CAACK,MAAM;IACvC;IAEA,OAAOR,OAAO;EAChB,CAAC;AACH;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASS,2BAA2BA,CAAA,EAAG;EAC5C,OAAO,UAASC,QAAQ,EAAEC,WAAW,EAAE;IACrC,MAAMZ,IAAI,GAAG,OAAOW,QAAQ,KAAK,QAAQ,GAAGA,QAAQ,GAAGA,QAAQ,CAACX,IAAI,IAAIW,QAAQ;IAChF,MAAMV,OAAO,GAAG,EAAE;IAElB,IAAI,CAACD,IAAI,IAAI,CAACY,WAAW,EAAE;MACzB,OAAOX,OAAO;IAChB;IAEA,MAAMC,SAAS,GAAGF,IAAI,CAACG,WAAW,CAAC,CAAC;IACpC,MAAMU,SAAS,GAAGD,WAAW,CAACT,WAAW,CAAC,CAAC;;IAE3C;IACA,IAAIE,UAAU,GAAG,CAAC;IAClB,IAAIC,KAAK;IACT,OAAO,CAACA,KAAK,GAAGJ,SAAS,CAACK,OAAO,CAACM,SAAS,EAAER,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE;MAChEJ,OAAO,CAACO,IAAI,CAACI,WAAW,CAAC;MACzBP,UAAU,GAAGC,KAAK,GAAGO,SAAS,CAACJ,MAAM;IACvC;;IAEA;IACA,IAAIR,OAAO,CAACQ,MAAM,KAAK,CAAC,IAAIb,aAAa,CAACgB,WAAW,CAAC,EAAE;MACtD;MACA;MACA,MAAME,kBAAkB,GAAGC,uBAAuB,CAACb,SAAS,EAAEW,SAAS,CAAC;MACxE,IAAIC,kBAAkB,EAAE;QACtBb,OAAO,CAACO,IAAI,CAACI,WAAW,CAAC;MAC3B;IACF;IAEA,OAAOX,OAAO;EAChB,CAAC;AACH;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,SAASc,uBAAuBA,CAACJ,QAAQ,EAAEC,WAAW,EAAE;EACtD;EACA,MAAMI,mBAAmB,GAAG;IAC1BC,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,SAAS,CAAC;IACvDC,CAAC,EAAE,CAAC,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,CAA
C;IAC7DC,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,CAAC;IAC9DC,CAAC,EAAE,CAAC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,KAAK,EAAE,KAAK,EAAE,OAAO,CAAC;IAC1DC,CAAC,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,KAAK,CAAC;IAC9CC,CAAC,EAAE,CAAC,WAAW,EAAE,UAAU,EAAE,KAAK,EAAE,KAAK,CAAC;IAC1CC,CAAC,EAAE,CAAC,WAAW,EAAE,UAAU,EAAE,KAAK,EAAE,KAAK,CAAC;IAC1CC,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;IACtCC,CAAC,EAAE,CAAC,YAAY,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC;IACxCC,CAAC,EAAE,CAAC,WAAW,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,CAAC;IAC3CC,CAAC,EAAE,CAAC,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,KAAK;EACtC,CAAC;EAED,MAAMC,eAAe,GAAGZ,mBAAmB,CAACJ,WAAW,CAAC,IAAI,EAAE;;EAE9D;EACA,OAAOgB,eAAe,CAACC,IAAI,CAACC,IAAI,IAAInB,QAAQ,CAACoB,QAAQ,CAACD,IAAI,CAAC3B,WAAW,CAAC,CAAC,CAAC,CAAC;AAC5E;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAAS6B,cAAcA,CAAChC,IAAI,EAAEiC,OAAO,EAAEC,cAAc,EAAEC,YAAY,GAAG,KAAK,EAAE;EAClF;EACA,IAAID,cAAc,EAAE;IAClB,OAAOA,cAAc;EACvB;;EAEA;EACA,IAAItC,aAAa,CAACqC,OAAO,GAAG,GAAG,GAAGjC,IAAI,CAAC,EAAE;IACvC,OAAOmC,YAAY,GAAGzB,2BAA2B,CAAC,CAAC,GAAGb,wBAAwB,CAAC,CAAC;EAClF;;EAEA;EACA,OAAO,KAAK;AACd;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASuC,2BAA2BA,CAACC,UAAU,EAAErC,IAAI,EAAEiC,OAAO,EAAEE,YAAY,GAAG,KAAK,EAAE;EAC3F,IAAI,CAACE,UAAU,CAACC,SAAS,CAAC,uBAAuB,CAAC,IAAI1C,aAAa,CAACqC,OAAO,GAAG,GAAG,GAAGjC,IAAI,CAAC,EAAE;IACzF;IACA,MAAMuC,kBAAkB,GAAGC,MAAM,CAACC,MAAM,CAACJ,UAAU,CAAC;IACpDE,kBAAkB,CAACG,OAAO,GAAG;MAAC,GAAGL,UAAU,CAACK;IAAO,CAAC;IACpDH,kBAAkB,CAACG,OAAO,CAACC,qBAAqB,GAAGR,YAAY,GAC3DzB,2BAA2B,CAAC,CAAC,GAC7Bb,wBAAwB,CAAC,CAAC;IAC9B,OAAO0C,kBAAkB;EAC3B;EAEA,OAAOF,UAAU;AACnB;;AAEA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,OAAO,SAASO,uBAAuBA,CAACP,UAAU,EAAErC,IAAI,EAAEiC,OAAO,EAAE;EACjE,OAAO,CAACI,UAAU,CAACC,SAAS,CAAC,uBAAuB,CAAC,IAAI1C,aAAa,CAACqC,OAAO,GAAG,GAAG,GAAGjC,IAAI,CAAC;AAC9F;AAEA,eAAe;EACbH,wBAAwB;EACxBa,2BAA2B;EAC3BsB,cAAc;EACdI,2BAA2B;EAC3BQ;AACF,CAAC","ignoreList":[]}
@@ -0,0 +1,17 @@
+/**
+ * Checks if the given text contains Chinese characters.
+ *
+ * @param {string} text The text to check.
+ *
+ * @returns {boolean} True if the text contains Chinese characters, false otherwise.
+ */
+export default function isChineseText(text) {
+  if (!text || typeof text !== 'string') {
+    return false;
+  }
+
+  // Chinese character range in Unicode
+  const chineseCharRegex = /[\u4e00-\u9fff]/;
+  return chineseCharRegex.test(text);
+}
+//# sourceMappingURL=isChineseText.js.map
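One caveat: U+4E00–U+9FFF is the CJK Unified Ideographs block, which Japanese kanji also occupy, so this check answers "contains CJK ideographs" rather than "is Chinese". In getLanguageResearcher above the Chinese test runs before the kana test, which is why kanji-bearing Japanese text without an explicit 'ja' locale lands on the Chinese researcher. Illustrative calls:

isChineseText("这是中文");          // true  – Chinese ideographs
isChineseText("ひらがなのみ");      // false – kana fall outside U+4E00–U+9FFF
isChineseText("日本語のテキスト");  // true  – the kanji 日本語 sit inside the block
isChineseText(12345);               // false – non-string input is rejected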
@@ -0,0 +1 @@
{"version":3,"file":"isChineseText.js","names":["isChineseText","text","chineseCharRegex","test"],"sources":["../../../../src/languageProcessing/helpers/language/isChineseText.js"],"sourcesContent":["/**\n * Checks if the given text contains Chinese characters.\n *\n * @param {string} text The text to check.\n *\n * @returns {boolean} True if the text contains Chinese characters, false otherwise.\n */\nexport default function isChineseText(text) {\n if (!text || typeof text !== 'string') {\n return false;\n }\n\n // Chinese character range in Unicode\n const chineseCharRegex = /[\\u4e00-\\u9fff]/;\n\n return chineseCharRegex.test(text);\n}\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASA,aAAaA,CAACC,IAAI,EAAE;EAC1C,IAAI,CAACA,IAAI,IAAI,OAAOA,IAAI,KAAK,QAAQ,EAAE;IACrC,OAAO,KAAK;EACd;;EAEA;EACA,MAAMC,gBAAgB,GAAG,iBAAiB;EAE1C,OAAOA,gBAAgB,CAACC,IAAI,CAACF,IAAI,CAAC;AACpC","ignoreList":[]}
@@ -6,7 +6,7 @@ import removePunctuation from "../sanitize/removePunctuation.js";
 import { unifyAllSpaces as unifyWhitespace } from "../sanitize/unifyWhitespace.js";
 import matchStringWithTransliteration from "./matchTextWithTransliteration.js";
 import { normalize as normalizeQuotes } from "../sanitize/quotes.js";
-import { map } from
+import { map } from 'lodash';
 
 /**
  * Returns the number of matches in a given string
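Only the lodash import line changes in this hunk; the removed line appears truncated in the extract. For reference, the return shape of matchTextWithWord, taken from the sourcesContent embedded in the updated map below, is an object with count, matches, and position. A rough sketch with default (non-custom-helper) matching:

// Sketch of the documented return shape (illustrative English example):
const { count, matches, position } = matchTextWithWord(
  "A quick test. Another quick test.", "quick", "en_US"
);
// count: 2, matches: ["quick", "quick"], position: 2 (index of the first match)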
@@ -1 +1 @@
-
{"version":3,"file":"matchTextWithWord.js","names":["stripSomeTags","stripSpaces","removePunctuation","unifyAllSpaces","unifyWhitespace","matchStringWithTransliteration","normalize","normalizeQuotes","map","text","wordToMatch","locale","matchWordCustomHelper","matches","keyword","positions","indexOf","count","length","position","Math","min"],"sources":["../../../../src/languageProcessing/helpers/match/matchTextWithWord.js"],"sourcesContent":["/** @module stringProcessing/matchTextWithWord */\n\nimport stripSomeTags from
+
{"version":3,"file":"matchTextWithWord.js","names":["stripSomeTags","stripSpaces","removePunctuation","unifyAllSpaces","unifyWhitespace","matchStringWithTransliteration","normalize","normalizeQuotes","map","text","wordToMatch","locale","matchWordCustomHelper","matches","keyword","positions","indexOf","count","length","position","Math","min"],"sources":["../../../../src/languageProcessing/helpers/match/matchTextWithWord.js"],"sourcesContent":["/** @module stringProcessing/matchTextWithWord */\n\nimport stripSomeTags from '../sanitize/stripNonTextTags.js';\nimport stripSpaces from '../sanitize/stripSpaces.js';\nimport removePunctuation from '../sanitize/removePunctuation.js';\nimport {unifyAllSpaces as unifyWhitespace} from '../sanitize/unifyWhitespace.js';\nimport matchStringWithTransliteration from './matchTextWithTransliteration.js';\nimport {normalize as normalizeQuotes} from '../sanitize/quotes.js';\nimport {map} from 'lodash';\n\n/**\n * Returns the number of matches in a given string\n *\n * @param {string} text The text to use for matching the wordToMatch.\n * @param {string} wordToMatch The word to match in the text.\n * @param {string} locale \t\t\t\tThe locale used for transliteration.\n * @param {function} matchWordCustomHelper \tThe helper function to match word in text.\n *\n * @returns {Object} An array with all matches of the text, the number of the matches, and the lowest number of positions of the matches.\n */\nexport default function(text, wordToMatch, locale, matchWordCustomHelper) {\n text = stripSomeTags(text);\n text = unifyWhitespace(text);\n text = normalizeQuotes(text);\n\n wordToMatch = normalizeQuotes(wordToMatch);\n let matches = matchWordCustomHelper\n ? matchWordCustomHelper(text, wordToMatch)\n : matchStringWithTransliteration(text, wordToMatch, locale);\n matches = map(matches, function(keyword) {\n return stripSpaces(removePunctuation(keyword));\n });\n\n // Create an array of positions of matches to determine where in the text the wordToMatch occurred first.\n const positions = map(matches, function(keyword) {\n return text.indexOf(keyword);\n });\n\n return {\n count: matches.length,\n matches: matches,\n position: positions.length === 0 ? -1 : Math.min(...positions)\n };\n}\n"],"mappings":"AAAA;;AAEA,OAAOA,aAAa;AACpB,OAAOC,WAAW;AAClB,OAAOC,iBAAiB;AACxB,SAAQC,cAAc,IAAIC,eAAe;AACzC,OAAOC,8BAA8B;AACrC,SAAQC,SAAS,IAAIC,eAAe;AACpC,SAAQC,GAAG,QAAO,QAAQ;;AAE1B;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,UAASC,IAAI,EAAEC,WAAW,EAAEC,MAAM,EAAEC,qBAAqB,EAAE;EACxEH,IAAI,GAAGT,aAAa,CAACS,IAAI,CAAC;EAC1BA,IAAI,GAAGL,eAAe,CAACK,IAAI,CAAC;EAC5BA,IAAI,GAAGF,eAAe,CAACE,IAAI,CAAC;EAE5BC,WAAW,GAAGH,eAAe,CAACG,WAAW,CAAC;EAC1C,IAAIG,OAAO,GAAGD,qBAAqB,GAC/BA,qBAAqB,CAACH,IAAI,EAAEC,WAAW,CAAC,GACxCL,8BAA8B,CAACI,IAAI,EAAEC,WAAW,EAAEC,MAAM,CAAC;EAC7DE,OAAO,GAAGL,GAAG,CAACK,OAAO,EAAE,UAASC,OAAO,EAAE;IACvC,OAAOb,WAAW,CAACC,iBAAiB,CAACY,OAAO,CAAC,CAAC;EAChD,CAAC,CAAC;;EAEF;EACA,MAAMC,SAAS,GAAGP,GAAG,CAACK,OAAO,EAAE,UAASC,OAAO,EAAE;IAC/C,OAAOL,IAAI,CAACO,OAAO,CAACF,OAAO,CAAC;EAC9B,CAAC,CAAC;EAEF,OAAO;IACLG,KAAK,EAAEJ,OAAO,CAACK,MAAM;IACrBL,OAAO,EAAEA,OAAO;IAChBM,QAAQ,EAAEJ,SAAS,CAACG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC,GAAGE,IAAI,CAACC,GAAG,CAAC,GAAGN,SAAS;EAC/D,CAAC;AACH","ignoreList":[]}
@@ -1,8 +1,8 @@
 /** @module stringProcessing/countWords */
 import sanitizeString from "../sanitize/sanitizeString";
-import { filter, flatMap } from
+import { filter, flatMap } from 'lodash';
 import removePunctuation, { punctuationRegexString } from "../sanitize/removePunctuation.js";
-const punctuationRegex = new RegExp(`([${punctuationRegexString}])`,
+const punctuationRegex = new RegExp(`([${punctuationRegexString}])`, 'g');
 
 /**
  * Returns an array with words used in the text.
@@ -13,25 +13,33 @@ const punctuationRegex = new RegExp(`([${punctuationRegexString}])`, "g");
  *
  * @returns {Array} The array with all words.
  */
-export default function (text, wordBoundaryRegexString =
+export default function (text, wordBoundaryRegexString = '\\s', shouldRemovePunctuation = true) {
   // Unify whitespaces and non-breaking spaces, remove table of content and strip the tags and multiple spaces.
   text = sanitizeString(text);
-  if (text ===
+  if (text === '') {
     return [];
   }
-  const
-
-
-
+  const chineseCharRegex = /[\u4e00-\u9fff]/;
+  const hasChinese = chineseCharRegex.test(text);
+  const hasNoSpaces = !/\s/.test(text);
+  let words = [];
+  if (hasChinese && hasNoSpaces) {
+    words = Array.from(text).filter(char => chineseCharRegex.test(char));
   } else {
-
-    words =
-
-
-  }
+    const wordBoundaryRegex = new RegExp(wordBoundaryRegexString, 'g');
+    words = text.split(wordBoundaryRegex);
+    if (shouldRemovePunctuation) {
+      words = words.map(removePunctuation);
+    } else {
+      // If punctuation is not removed, punctuation marks are tokenized as if they were words.
+      words = flatMap(words, word => {
+        const newWord = word.replace(punctuationRegex, ' $1 ');
+        return newWord.split(' ');
+      });
+    }
   }
   return filter(words, function (word) {
-    return word.trim() !==
+    return word.trim() !== '';
   });
 }
 //# sourceMappingURL=getWords.js.map
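The new branch changes tokenization for unspaced Chinese text: each ideograph becomes its own word, and anything outside U+4E00–U+9FFF is silently dropped by the Array.from(...).filter(...) step. Text containing any whitespace still goes through the old boundary split. Roughly:

getWords("跨境物流很方便");       // ["跨", "境", "物", "流", "很", "方", "便"]
getWords("物流abc很快");         // ["物", "流", "很", "快"] – the latin run is discarded
getWords("cross-border 物流");   // has a space → old \s split: ["cross-border", "物流"]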
@@ -1 +1 @@
-
{"version":3,"file":"getWords.js","names":["sanitizeString","filter","flatMap","removePunctuation","punctuationRegexString","punctuationRegex","RegExp","text","wordBoundaryRegexString","shouldRemovePunctuation","
+
{"version":3,"file":"getWords.js","names":["sanitizeString","filter","flatMap","removePunctuation","punctuationRegexString","punctuationRegex","RegExp","text","wordBoundaryRegexString","shouldRemovePunctuation","chineseCharRegex","hasChinese","test","hasNoSpaces","words","Array","from","char","wordBoundaryRegex","split","map","word","newWord","replace","trim"],"sources":["../../../../src/languageProcessing/helpers/word/getWords.js"],"sourcesContent":["/** @module stringProcessing/countWords */\nimport sanitizeString from '../sanitize/sanitizeString';\nimport {filter, flatMap} from 'lodash';\nimport removePunctuation, {punctuationRegexString} from '../sanitize/removePunctuation.js';\n\nconst punctuationRegex = new RegExp(`([${punctuationRegexString}])`, 'g');\n\n/**\n * Returns an array with words used in the text.\n *\n * @param {string} text The text to be counted.\n * @param {string} [wordBoundaryRegexString=\\\\s] The regex string for the word boundary that should be used to split the text into words.\n * @param {boolean} [shouldRemovePunctuation=true] If punctuation should be removed. Defaults to `true`.\n *\n * @returns {Array} The array with all words.\n */\nexport default function(text, wordBoundaryRegexString = '\\\\s', shouldRemovePunctuation = true) {\n // Unify whitespaces and non-breaking spaces, remove table of content and strip the tags and multiple spaces.\n text = sanitizeString(text);\n\n if (text === '') {\n return [];\n }\n\n const chineseCharRegex = /[\\u4e00-\\u9fff]/;\n const hasChinese = chineseCharRegex.test(text);\n const hasNoSpaces = !/\\s/.test(text);\n\n let words = [];\n\n if (hasChinese && hasNoSpaces) {\n words = Array.from(text).filter(char => chineseCharRegex.test(char));\n } else {\n const wordBoundaryRegex = new RegExp(wordBoundaryRegexString, 'g');\n\n words = text.split(wordBoundaryRegex);\n\n if (shouldRemovePunctuation) {\n words = words.map(removePunctuation);\n } else {\n // If punctuation is not removed, punctuation marks are tokenized as if they were words.\n words = flatMap(words, word => {\n const newWord = word.replace(punctuationRegex, ' $1 ');\n return newWord.split(' ');\n });\n }\n }\n\n return filter(words, function(word) {\n return word.trim() !== '';\n });\n}\n"],"mappings":"AAAA;AACA,OAAOA,cAAc;AACrB,SAAQC,MAAM,EAAEC,OAAO,QAAO,QAAQ;AACtC,OAAOC,iBAAiB,IAAGC,sBAAsB;AAEjD,MAAMC,gBAAgB,GAAG,IAAIC,MAAM,CAAC,KAAKF,sBAAsB,IAAI,EAAE,GAAG,CAAC;;AAEzE;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,UAASG,IAAI,EAAEC,uBAAuB,GAAG,KAAK,EAAEC,uBAAuB,GAAG,IAAI,EAAE;EAC7F;EACAF,IAAI,GAAGP,cAAc,CAACO,IAAI,CAAC;EAE3B,IAAIA,IAAI,KAAK,EAAE,EAAE;IACf,OAAO,EAAE;EACX;EAEA,MAAMG,gBAAgB,GAAG,iBAAiB;EAC1C,MAAMC,UAAU,GAAGD,gBAAgB,CAACE,IAAI,CAACL,IAAI,CAAC;EAC9C,MAAMM,WAAW,GAAG,CAAC,IAAI,CAACD,IAAI,CAACL,IAAI,CAAC;EAEpC,IAAIO,KAAK,GAAG,EAAE;EAEd,IAAIH,UAAU,IAAIE,WAAW,EAAE;IAC7BC,KAAK,GAAGC,KAAK,CAACC,IAAI,CAACT,IAAI,CAAC,CAACN,MAAM,CAACgB,IAAI,IAAIP,gBAAgB,CAACE,IAAI,CAACK,IAAI,CAAC,CAAC;EACtE,CAAC,MAAM;IACL,MAAMC,iBAAiB,GAAG,IAAIZ,MAAM,CAACE,uBAAuB,EAAE,GAAG,CAAC;IAElEM,KAAK,GAAGP,IAAI,CAACY,KAAK,CAACD,iBAAiB,CAAC;IAErC,IAAIT,uBAAuB,EAAE;MAC3BK,KAAK,GAAGA,KAAK,CAACM,GAAG,CAACjB,iBAAiB,CAAC;IACtC,CAAC,MAAM;MACL;MACAW,KAAK,GAAGZ,OAAO,CAACY,KAAK,EAAEO,IAAI,IAAI;QAC7B,MAAMC,OAAO,GAAGD,IAAI,CAACE,OAAO,CAAClB,gBAAgB,EAAE,MAAM,CAAC;QACtD,OAAOiB,OAAO,CAACH,KAAK,CAAC,GAAG,CAAC;MAC3B,CAAC,CAAC;IACJ;EACF;EAEA,OAAOlB,MAAM,CAACa,KAAK,EAAE,UAASO,IAAI,EAAE;IAClC,OAAOA,IAAI,CAACG,IAAI,CAAC,CAAC,KAAK,EAAE;EAC3B,CAAC,CAAC;AACJ","ignoreList":[]}
@@ -0,0 +1,41 @@
+import { languageProcessing } from "../../../index.js";
+const {
+  AbstractResearcher
+} = languageProcessing;
+
+// All helpers
+import matchWordCustomHelper from "./helpers/matchTextWithWord";
+import splitIntoTokensCustom from "./helpers/splitIntoTokensCustom";
+import getSentencesCustom from "./helpers/getSentences";
+
+// All config
+import { all as functionWords } from "./config/functionWords";
+
+/**
+ * The researcher contains all the researches, helpers, data, and config for Chinese language.
+ */
+export default class Researcher extends AbstractResearcher {
+  /**
+   * Constructor
+   * @param {Paper} paper The Paper object that is needed within the researches.
+   * @constructor
+   */
+  constructor(paper) {
+    super(paper);
+
+    // Delete researches that are not suitable for Chinese
+    delete this.defaultResearches.getFleschReadingScore;
+    Object.assign(this.config, {
+      language: 'zh',
+      functionWords,
+      // Chinese doesn't use hyphens as word boundaries
+      areHyphensWordBoundaries: false
+    });
+    Object.assign(this.helpers, {
+      matchWordCustomHelper,
+      splitIntoTokensCustom,
+      getSentencesCustom
+    });
+  }
+}
+//# sourceMappingURL=Researcher.js.map
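The zh researcher follows the same wiring pattern as the other language researchers: config is merged over the defaults and the three custom helpers are registered under the names the researches probe for (matchWordCustomHelper, splitIntoTokensCustom, getSentencesCustom). A usage sketch (Paper construction elided; the import path mirrors the build layout):

import Researcher from "axyseo/build/languageProcessing/languages/zh/Researcher.js";

const researcher = new Researcher(paper);
researcher.getHelper("matchWordCustomHelper");  // the exact-substring matcher shown below
// getFleschReadingScore is deleted in the constructor, so Flesch reading
// ease is simply not computed for Chinese content.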
@@ -0,0 +1 @@
{"version":3,"file":"Researcher.js","names":["languageProcessing","AbstractResearcher","matchWordCustomHelper","splitIntoTokensCustom","getSentencesCustom","all","functionWords","Researcher","constructor","paper","defaultResearches","getFleschReadingScore","Object","assign","config","language","areHyphensWordBoundaries","helpers"],"sources":["../../../../src/languageProcessing/languages/zh/Researcher.js"],"sourcesContent":["import {languageProcessing} from '@axyseo/index.js';\nconst {AbstractResearcher} = languageProcessing;\n\n// All helpers\nimport matchWordCustomHelper from './helpers/matchTextWithWord';\nimport splitIntoTokensCustom from './helpers/splitIntoTokensCustom';\nimport getSentencesCustom from './helpers/getSentences';\n\n// All config\nimport {all as functionWords} from './config/functionWords';\n\n/**\n * The researcher contains all the researches, helpers, data, and config for Chinese language.\n */\nexport default class Researcher extends AbstractResearcher {\n /**\n * Constructor\n * @param {Paper} paper The Paper object that is needed within the researches.\n * @constructor\n */\n constructor(paper) {\n super(paper);\n\n // Delete researches that are not suitable for Chinese\n delete this.defaultResearches.getFleschReadingScore;\n\n Object.assign(this.config, {\n language: 'zh',\n functionWords,\n // Chinese doesn't use hyphens as word boundaries\n areHyphensWordBoundaries: false\n });\n\n Object.assign(this.helpers, {\n matchWordCustomHelper,\n splitIntoTokensCustom,\n getSentencesCustom\n });\n }\n}\n"],"mappings":"AAAA,SAAQA,kBAAkB;AAC1B,MAAM;EAACC;AAAkB,CAAC,GAAGD,kBAAkB;;AAE/C;AACA,OAAOE,qBAAqB;AAC5B,OAAOC,qBAAqB;AAC5B,OAAOC,kBAAkB;;AAEzB;AACA,SAAQC,GAAG,IAAIC,aAAa;;AAE5B;AACA;AACA;AACA,eAAe,MAAMC,UAAU,SAASN,kBAAkB,CAAC;EACzD;AACF;AACA;AACA;AACA;EACEO,WAAWA,CAACC,KAAK,EAAE;IACjB,KAAK,CAACA,KAAK,CAAC;;IAEZ;IACA,OAAO,IAAI,CAACC,iBAAiB,CAACC,qBAAqB;IAEnDC,MAAM,CAACC,MAAM,CAAC,IAAI,CAACC,MAAM,EAAE;MACzBC,QAAQ,EAAE,IAAI;MACdT,aAAa;MACb;MACAU,wBAAwB,EAAE;IAC5B,CAAC,CAAC;IAEFJ,MAAM,CAACC,MAAM,CAAC,IAAI,CAACI,OAAO,EAAE;MAC1Bf,qBAAqB;MACrBC,qBAAqB;MACrBC;IACF,CAAC,CAAC;EACJ;AACF","ignoreList":[]}
@@ -0,0 +1,40 @@
+/**
+ * Returns an object with Chinese function words.
+ *
+ * @returns {Array} The array of Chinese function words.
+ */
+
+// Articles and determiners
+const articles = ['这', '那', '这个', '那个', '这些', '那些', '一个', '一些'];
+
+// Pronouns
+const pronouns = [
+  // Personal pronouns
+  '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '它们',
+  // Possessive pronouns
+  '我的', '你的', '他的', '她的', '它的', '我们的', '你们的', '他们的', '她们的', '它们的',
+  // Demonstrative pronouns
+  '这', '那', '这个', '那个', '这些', '那些', '这里', '那里', '这儿', '那儿'];
+
+// Prepositions
+const prepositions = ['在', '从', '到', '向', '朝', '往', '由', '被', '把', '对', '为', '给', '跟', '和', '与', '同', '关于', '按照', '根据'];
+
+// Conjunctions
+const conjunctions = ['和', '或', '但', '但是', '可是', '然而', '不过', '而且', '并且', '以及', '还有', '或者', '要么', '因为', '所以', '如果', '要是', '虽然', '尽管'];
+
+// Auxiliary verbs and modal verbs
+const auxiliaries = ['是', '有', '没', '没有', '能', '可以', '会', '要', '想', '应该', '必须', '可能', '或许', '也许', '将', '将要', '正在', '已经', '刚刚'];
+
+// Quantifiers and numbers
+const quantifiers = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '百', '千', '万', '亿', '个', '只', '条', '本', '张', '件', '台', '辆', '所', '座', '栋', '间', '层', '些', '点', '种', '样', '次', '遍', '回'];
+
+// Adverbs
+const adverbs = ['很', '非常', '特别', '十分', '极', '太', '更', '最', '还', '也', '都', '只', '就', '才', '又', '再', '还是', '总是', '经常', '有时', '偶尔', '从来', '永远'];
+
+// Particles
+const particles = ['的', '地', '得', '了', '着', '过', '呢', '吗', '吧', '啊', '呀', '哦', '嗯', '哪', '什么', '怎么', '为什么', '哪里', '哪儿', '怎样'];
+
+// Interjections
+const interjections = ['哦', '啊', '呀', '哇', '嗯', '嘿', '喂', '哎', '唉', '咦', '哟', '呵', '嘻', '哈'];
+export const all = [].concat(articles, pronouns, prepositions, conjunctions, auxiliaries, quantifiers, adverbs, particles, interjections);
+//# sourceMappingURL=functionWords.js.map
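Despite the JSDoc wording ("an object"), the export is a flat array built by concatenating the nine category lists, with duplicates across categories ('这', '和', '哦', ...) retained. A sketch of the kind of stop-word filtering such a list supports (the token array here is hypothetical, not from the package):

import { all as functionWords } from "axyseo/build/languageProcessing/languages/zh/config/functionWords.js";

const tokens = ["我们", "的", "物流", "服务"];   // hypothetical keyphrase tokens
const contentWords = tokens.filter(t => !functionWords.includes(t));
// ["物流", "服务"] – the pronoun 我们 and the particle 的 are filtered out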
@@ -0,0 +1 @@
{"version":3,"file":"functionWords.js","names":["articles","pronouns","prepositions","conjunctions","auxiliaries","quantifiers","adverbs","particles","interjections","all","concat"],"sources":["../../../../../src/languageProcessing/languages/zh/config/functionWords.js"],"sourcesContent":["/**\n * Returns an object with Chinese function words.\n *\n * @returns {Array} The array of Chinese function words.\n */\n\n// Articles and determiners\nconst articles = ['这', '那', '这个', '那个', '这些', '那些', '一个', '一些'];\n\n// Pronouns\nconst pronouns = [\n // Personal pronouns\n '我',\n '你',\n '他',\n '她',\n '它',\n '我们',\n '你们',\n '他们',\n '她们',\n '它们',\n // Possessive pronouns\n '我的',\n '你的',\n '他的',\n '她的',\n '它的',\n '我们的',\n '你们的',\n '他们的',\n '她们的',\n '它们的',\n // Demonstrative pronouns\n '这',\n '那',\n '这个',\n '那个',\n '这些',\n '那些',\n '这里',\n '那里',\n '这儿',\n '那儿'\n];\n\n// Prepositions\nconst prepositions = [\n '在',\n '从',\n '到',\n '向',\n '朝',\n '往',\n '由',\n '被',\n '把',\n '对',\n '为',\n '给',\n '跟',\n '和',\n '与',\n '同',\n '关于',\n '按照',\n '根据'\n];\n\n// Conjunctions\nconst conjunctions = [\n '和',\n '或',\n '但',\n '但是',\n '可是',\n '然而',\n '不过',\n '而且',\n '并且',\n '以及',\n '还有',\n '或者',\n '要么',\n '因为',\n '所以',\n '如果',\n '要是',\n '虽然',\n '尽管'\n];\n\n// Auxiliary verbs and modal verbs\nconst auxiliaries = [\n '是',\n '有',\n '没',\n '没有',\n '能',\n '可以',\n '会',\n '要',\n '想',\n '应该',\n '必须',\n '可能',\n '或许',\n '也许',\n '将',\n '将要',\n '正在',\n '已经',\n '刚刚'\n];\n\n// Quantifiers and numbers\nconst quantifiers = [\n '一',\n '二',\n '三',\n '四',\n '五',\n '六',\n '七',\n '八',\n '九',\n '十',\n '百',\n '千',\n '万',\n '亿',\n '个',\n '只',\n '条',\n '本',\n '张',\n '件',\n '台',\n '辆',\n '所',\n '座',\n '栋',\n '间',\n '层',\n '些',\n '点',\n '种',\n '样',\n '次',\n '遍',\n '回'\n];\n\n// Adverbs\nconst adverbs = [\n '很',\n '非常',\n '特别',\n '十分',\n '极',\n '太',\n '更',\n '最',\n '还',\n '也',\n '都',\n '只',\n '就',\n '才',\n '又',\n '再',\n '还是',\n '总是',\n '经常',\n '有时',\n '偶尔',\n '从来',\n '永远'\n];\n\n// Particles\nconst particles = [\n '的',\n '地',\n '得',\n '了',\n '着',\n '过',\n '呢',\n '吗',\n '吧',\n '啊',\n '呀',\n '哦',\n '嗯',\n '哪',\n '什么',\n '怎么',\n '为什么',\n '哪里',\n '哪儿',\n '怎样'\n];\n\n// Interjections\nconst interjections = [\n '哦',\n '啊',\n '呀',\n '哇',\n '嗯',\n '嘿',\n '喂',\n '哎',\n '唉',\n '咦',\n '哟',\n '呵',\n '嘻',\n '哈'\n];\n\nexport const all = [].concat(\n articles,\n pronouns,\n prepositions,\n conjunctions,\n auxiliaries,\n quantifiers,\n adverbs,\n particles,\n 
interjections\n);\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;;AAEA;AACA,MAAMA,QAAQ,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;;AAE/D;AACA,MAAMC,QAAQ,GAAG;AACf;AACA,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI;AACJ;AACA,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,KAAK,EACL,KAAK,EACL,KAAK,EACL,KAAK,EACL,KAAK;AACL;AACA,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,YAAY,GAAG,CACnB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,YAAY,GAAG,CACnB,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,WAAW,GAAG,CAClB,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,GAAG,EACH,IAAI,EACJ,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,WAAW,GAAG,CAClB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,CACJ;;AAED;AACA,MAAMC,OAAO,GAAG,CACd,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,SAAS,GAAG,CAChB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,IAAI,EACJ,IAAI,EACJ,KAAK,EACL,IAAI,EACJ,IAAI,EACJ,IAAI,CACL;;AAED;AACA,MAAMC,aAAa,GAAG,CACpB,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,EACH,GAAG,CACJ;AAED,OAAO,MAAMC,GAAG,GAAG,EAAE,CAACC,MAAM,CAC1BV,QAAQ,EACRC,QAAQ,EACRC,YAAY,EACZC,YAAY,EACZC,WAAW,EACXC,WAAW,EACXC,OAAO,EACPC,SAAS,EACTC,aACF,CAAC","ignoreList":[]}
@@ -0,0 +1,42 @@
+/**
+ * Gets sentences from Chinese text using Chinese punctuation marks.
+ *
+ * @param {string} text The text to get sentences from.
+ *
+ * @returns {string[]} An array of sentences.
+ */
+export default function getSentences(text) {
+  if (!text) {
+    return [];
+  }
+
+  // Chinese sentence-ending punctuation marks
+  const chineseSentenceEnders = /[。!?;]/;
+  const englishSentenceEnders = /[.!?;]/;
+
+  // Split by Chinese and English sentence enders, keeping the punctuation
+  const sentences = text.split(/([。!?;.!?;])/).filter(sentence => sentence.trim() !== '');
+  const result = [];
+  let currentSentence = '';
+  for (let i = 0; i < sentences.length; i++) {
+    const part = sentences[i];
+    if (chineseSentenceEnders.test(part) || englishSentenceEnders.test(part)) {
+      // This is punctuation - add to current sentence and finalize it
+      currentSentence += part;
+      if (currentSentence.trim()) {
+        result.push(currentSentence.trim());
+      }
+      currentSentence = '';
+    } else {
+      // This is sentence content
+      currentSentence += part;
+    }
+  }
+
+  // Add any remaining sentence
+  if (currentSentence.trim()) {
+    result.push(currentSentence.trim());
+  }
+  return result.filter(sentence => sentence.length > 0);
+}
+//# sourceMappingURL=getSentences.js.map
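The splitter keeps each terminator attached to its sentence and treats a trailing fragment without a terminator as a sentence of its own; the combined split regex already covers both punctuation sets, so the two *SentenceEnders regexes only re-classify the emitted parts. Illustrative calls:

getSentences("今天天气不错。我们去公园吧!好的");
// ["今天天气不错。", "我们去公园吧!", "好的"]
getSentences("");  // [] – empty input short-circuits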
@@ -0,0 +1 @@
{"version":3,"file":"getSentences.js","names":["getSentences","text","chineseSentenceEnders","englishSentenceEnders","sentences","split","filter","sentence","trim","result","currentSentence","i","length","part","test","push"],"sources":["../../../../../src/languageProcessing/languages/zh/helpers/getSentences.js"],"sourcesContent":["/**\n * Gets sentences from Chinese text using Chinese punctuation marks.\n *\n * @param {string} text The text to get sentences from.\n *\n * @returns {string[]} An array of sentences.\n */\nexport default function getSentences(text) {\n if (!text) {\n return [];\n }\n\n // Chinese sentence-ending punctuation marks\n const chineseSentenceEnders = /[。!?;]/;\n const englishSentenceEnders = /[.!?;]/;\n\n // Split by Chinese and English sentence enders, keeping the punctuation\n const sentences = text.split(/([。!?;.!?;])/).filter(sentence => sentence.trim() !== '');\n\n const result = [];\n let currentSentence = '';\n\n for (let i = 0; i < sentences.length; i++) {\n const part = sentences[i];\n\n if (chineseSentenceEnders.test(part) || englishSentenceEnders.test(part)) {\n // This is punctuation - add to current sentence and finalize it\n currentSentence += part;\n if (currentSentence.trim()) {\n result.push(currentSentence.trim());\n }\n currentSentence = '';\n } else {\n // This is sentence content\n currentSentence += part;\n }\n }\n\n // Add any remaining sentence\n if (currentSentence.trim()) {\n result.push(currentSentence.trim());\n }\n\n return result.filter(sentence => sentence.length > 0);\n}\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASA,YAAYA,CAACC,IAAI,EAAE;EACzC,IAAI,CAACA,IAAI,EAAE;IACT,OAAO,EAAE;EACX;;EAEA;EACA,MAAMC,qBAAqB,GAAG,QAAQ;EACtC,MAAMC,qBAAqB,GAAG,QAAQ;;EAEtC;EACA,MAAMC,SAAS,GAAGH,IAAI,CAACI,KAAK,CAAC,cAAc,CAAC,CAACC,MAAM,CAACC,QAAQ,IAAIA,QAAQ,CAACC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;EAEvF,MAAMC,MAAM,GAAG,EAAE;EACjB,IAAIC,eAAe,GAAG,EAAE;EAExB,KAAK,IAAIC,CAAC,GAAG,CAAC,EAAEA,CAAC,GAAGP,SAAS,CAACQ,MAAM,EAAED,CAAC,EAAE,EAAE;IACzC,MAAME,IAAI,GAAGT,SAAS,CAACO,CAAC,CAAC;IAEzB,IAAIT,qBAAqB,CAACY,IAAI,CAACD,IAAI,CAAC,IAAIV,qBAAqB,CAACW,IAAI,CAACD,IAAI,CAAC,EAAE;MACxE;MACAH,eAAe,IAAIG,IAAI;MACvB,IAAIH,eAAe,CAACF,IAAI,CAAC,CAAC,EAAE;QAC1BC,MAAM,CAACM,IAAI,CAACL,eAAe,CAACF,IAAI,CAAC,CAAC,CAAC;MACrC;MACAE,eAAe,GAAG,EAAE;IACtB,CAAC,MAAM;MACL;MACAA,eAAe,IAAIG,IAAI;IACzB;EACF;;EAEA;EACA,IAAIH,eAAe,CAACF,IAAI,CAAC,CAAC,EAAE;IAC1BC,MAAM,CAACM,IAAI,CAACL,eAAe,CAACF,IAAI,CAAC,CAAC,CAAC;EACrC;EAEA,OAAOC,MAAM,CAACH,MAAM,CAACC,QAAQ,IAAIA,QAAQ,CAACK,MAAM,GAAG,CAAC,CAAC;AACvD","ignoreList":[]}
@@ -0,0 +1,35 @@
+/**
+ * Matches a word in a text for Chinese language.
+ * Chinese doesn't use spaces between words, so we use exact string matching.
+ * This function is compatible with the matchWordCustomHelper interface.
+ *
+ * @param {string|Object} sentence The sentence to search in (can be string or sentence object).
+ * @param {string} word The word to search for.
+ *
+ * @returns {Array} An array of matches found in the text.
+ */
+export default function matchTextWithWord(sentence, word) {
+  const matches = [];
+
+  // Handle both string and sentence object
+  const text = typeof sentence === 'string' ? sentence : sentence.text || sentence;
+  if (!text || !word) {
+    return matches;
+  }
+
+  // Convert both text and word to lowercase for case-insensitive matching
+  const lowerText = text.toLowerCase();
+  const lowerWord = word.toLowerCase();
+  let startIndex = 0;
+  let index;
+
+  // Find all occurrences of the word in the text
+  while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {
+    // For Chinese, we add the matched word directly as a string
+    // This matches the expected return format for the custom helper
+    matches.push(word);
+    startIndex = index + lowerWord.length;
+  }
+  return matches;
+}
+//# sourceMappingURL=matchTextWithWord.js.map
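This is the helper the zh Researcher registers as matchWordCustomHelper; it is essentially the same exact-substring matcher as createChineseMatchHelper in chineseHelperFactory.js, duplicated so the researcher does not depend on the factory. Illustrative calls:

matchTextWithWord("物流公司提供物流服务", "物流");  // ["物流", "物流"]
matchTextWithWord({ text: "物流很快" }, "物流");    // ["物流"] – sentence objects work too
matchTextWithWord("没有匹配", "仓储");              // []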
@@ -0,0 +1 @@
{"version":3,"file":"matchTextWithWord.js","names":["matchTextWithWord","sentence","word","matches","text","lowerText","toLowerCase","lowerWord","startIndex","index","indexOf","push","length"],"sources":["../../../../../src/languageProcessing/languages/zh/helpers/matchTextWithWord.js"],"sourcesContent":["/**\n * Matches a word in a text for Chinese language.\n * Chinese doesn't use spaces between words, so we use exact string matching.\n * This function is compatible with the matchWordCustomHelper interface.\n *\n * @param {string|Object} sentence The sentence to search in (can be string or sentence object).\n * @param {string} word The word to search for.\n *\n * @returns {Array} An array of matches found in the text.\n */\nexport default function matchTextWithWord(sentence, word) {\n const matches = [];\n\n // Handle both string and sentence object\n const text = typeof sentence === 'string' ? sentence : sentence.text || sentence;\n\n if (!text || !word) {\n return matches;\n }\n\n // Convert both text and word to lowercase for case-insensitive matching\n const lowerText = text.toLowerCase();\n const lowerWord = word.toLowerCase();\n\n let startIndex = 0;\n let index;\n\n // Find all occurrences of the word in the text\n while ((index = lowerText.indexOf(lowerWord, startIndex)) !== -1) {\n // For Chinese, we add the matched word directly as a string\n // This matches the expected return format for the custom helper\n matches.push(word);\n startIndex = index + lowerWord.length;\n }\n\n return matches;\n}\n"],"mappings":"AAAA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA,eAAe,SAASA,iBAAiBA,CAACC,QAAQ,EAAEC,IAAI,EAAE;EACxD,MAAMC,OAAO,GAAG,EAAE;;EAElB;EACA,MAAMC,IAAI,GAAG,OAAOH,QAAQ,KAAK,QAAQ,GAAGA,QAAQ,GAAGA,QAAQ,CAACG,IAAI,IAAIH,QAAQ;EAEhF,IAAI,CAACG,IAAI,IAAI,CAACF,IAAI,EAAE;IAClB,OAAOC,OAAO;EAChB;;EAEA;EACA,MAAME,SAAS,GAAGD,IAAI,CAACE,WAAW,CAAC,CAAC;EACpC,MAAMC,SAAS,GAAGL,IAAI,CAACI,WAAW,CAAC,CAAC;EAEpC,IAAIE,UAAU,GAAG,CAAC;EAClB,IAAIC,KAAK;;EAET;EACA,OAAO,CAACA,KAAK,GAAGJ,SAAS,CAACK,OAAO,CAACH,SAAS,EAAEC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE;IAChE;IACA;IACAL,OAAO,CAACQ,IAAI,CAACT,IAAI,CAAC;IAClBM,UAAU,GAAGC,KAAK,GAAGF,SAAS,CAACK,MAAM;EACvC;EAEA,OAAOT,OAAO;AAChB","ignoreList":[]}