resuml 1.12.0 → 1.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -762,7 +762,71 @@ var init_en = __esm({
762
762
  "process",
763
763
  "robust",
764
764
  "consistent",
765
- "operations"
765
+ "operations",
766
+ // URL/email/domain fragments
767
+ "http",
768
+ "https",
769
+ "www",
770
+ "com",
771
+ "org",
772
+ "net",
773
+ "mailto",
774
+ // Resume/YAML schema field names (in case raw YAML is pasted)
775
+ "name",
776
+ "keywords",
777
+ "highlights",
778
+ "startdate",
779
+ "enddate",
780
+ "website",
781
+ "profiles",
782
+ "basics",
783
+ "position",
784
+ "institution",
785
+ "studytype",
786
+ "fluency",
787
+ "issuer",
788
+ "network",
789
+ "username",
790
+ "countrycode",
791
+ "region",
792
+ // Generic nouns that aren't skills
793
+ "product",
794
+ "company",
795
+ "service",
796
+ "services",
797
+ "platform",
798
+ "solutions",
799
+ "ability",
800
+ "opportunity",
801
+ "candidate",
802
+ "applicant",
803
+ "position",
804
+ "salary",
805
+ "compensation",
806
+ "benefits",
807
+ "perks",
808
+ "bonus",
809
+ "development",
810
+ "management",
811
+ "knowledge",
812
+ "modern",
813
+ "advanced",
814
+ "practices",
815
+ "nice",
816
+ "technologies",
817
+ "technology",
818
+ "frameworks",
819
+ "framework",
820
+ "tools",
821
+ "data",
822
+ "based",
823
+ "contribute",
824
+ "contributions",
825
+ "migration",
826
+ "leading",
827
+ "source",
828
+ "visit",
829
+ "join"
766
830
  ]
767
831
  };
768
832
  en_default = en;
@@ -1333,8 +1397,20 @@ var init_genericChecks = __esm({
1333
1397
  });
1334
1398
 
1335
1399
  // src/ats/jdMatcher.ts
1400
/**
 * Removes text that should never be considered a keyword before tokenizing.
 *
 * Steps, in order:
 *   1. http(s) URLs, 2. bare `www.` hosts, 3. e-mail addresses,
 *   4. slash-prefixed paths, each replaced by a single space;
 *   5. camelCase identifiers (e.g. `startDate`) are split into separate words.
 *
 * The camelCase split is skipped when it would produce a fragment shorter
 * than three characters. The tokenizer in this file discards words of two
 * characters or fewer, so splitting names such as `iOS`, `macOS`, `jQuery`
 * or `gRPC` would silently drop (part of) the term instead of cleaning it.
 *
 * @param {string} text - Raw job-description or resume text.
 * @returns {string} The text with noise replaced by spaces and safe
 *   camelCase identifiers split on their lower→upper boundaries.
 */
function stripNoise(text) {
  // Shortest fragment that survives the tokenizer's length filter.
  const MIN_FRAGMENT_LENGTH = 3;

  const withoutNoise = text
    .replace(/https?:\/\/[^\s]+/gi, " ")
    .replace(/www\.[^\s]+/gi, " ")
    .replace(/[\w.+-]+@[\w.-]+\.[a-z]{2,}/gi, " ")
    .replace(/(?:^|\s)\/[\w/.-]+/g, " ");

  // Only words that start lowercase and contain an uppercase letter qualify.
  return withoutNoise.replace(/\b[a-z]+[A-Z][a-zA-Z]*\b/g, (identifier) => {
    const spaced = identifier.replace(/([a-z])([A-Z])/g, "$1 $2");
    const isLossless = spaced
      .split(" ")
      .every((fragment) => fragment.length >= MIN_FRAGMENT_LENGTH);
    // Keep the identifier intact when splitting would shred it into
    // fragments the tokenizer throws away.
    return isLossless ? spaced : identifier;
  });
}
1336
1405
  function tokenize(text, stopWords) {
1337
- return text.toLowerCase().replace(/[^a-zA-Z0-9äöüßÄÖÜàáâãéèêëíìîïóòôõúùûüñç\s/+-]/g, " ").split(/\s+/).filter((word) => word.length > 2 && !stopWords.has(word));
1406
+ return text.toLowerCase().replace(/[^a-zA-Z0-9äöüßÄÖÜàáâãéèêëíìîïóòôõúùûüñç\s/+-]/g, " ").split(/\s+/).filter((word) => {
1407
+ if (word.length <= 2) return false;
1408
+ if (stopWords.has(word)) return false;
1409
+ if (word.startsWith("//") || word.startsWith("http")) return false;
1410
+ if (/^\d+$/.test(word)) return false;
1411
+ if (/^[/+-]+$/.test(word)) return false;
1412
+ return true;
1413
+ });
1338
1414
  }
1339
1415
  function simpleStem(word, language) {
1340
1416
  if (language === "de") {
@@ -1496,12 +1572,17 @@ function extractBrandNames(text) {
1496
1572
  function extractKeywords(text, language, maxKeywords = 30) {
1497
1573
  const langData = getLanguageData(language);
1498
1574
  const stopWords = new Set(langData.stopWords);
1499
- const compoundTerms = extractCompoundTerms(text);
1575
+ const cleanText = stripNoise(text);
1576
+ const compoundTerms = extractCompoundTerms(cleanText);
1500
1577
  const brandNames = extractBrandNames(text);
1501
- const { requirementText, otherText } = splitJdSections(text);
1502
- const reqTokens = tokenize(requirementText, stopWords).filter((t) => !brandNames.has(t));
1503
- const otherTokens = tokenize(otherText, stopWords).filter((t) => !brandNames.has(t));
1504
- const allTokens = [...reqTokens, ...reqTokens, ...reqTokens, ...otherTokens];
1578
+ const { requirementText } = splitJdSections(cleanText);
1579
+ const hasRequirementSections = requirementText.trim().length > 0;
1580
+ let allTokens;
1581
+ if (hasRequirementSections) {
1582
+ allTokens = tokenize(requirementText, stopWords).filter((t) => !brandNames.has(t));
1583
+ } else {
1584
+ allTokens = tokenize(cleanText, stopWords).filter((t) => !brandNames.has(t));
1585
+ }
1505
1586
  const stemmed = allTokens.map((t) => simpleStem(t, language));
1506
1587
  const tf = buildTfMap(stemmed);
1507
1588
  const stemToOriginal = /* @__PURE__ */ new Map();
@@ -1515,7 +1596,7 @@ function extractKeywords(text, language, maxKeywords = 30) {
1515
1596
  const compoundWordSet = new Set(compoundsFlat);
1516
1597
  const singleKeywords = [...tf.entries()].filter(([stem]) => stem.length > 2).filter(([stem]) => {
1517
1598
  const original = stemToOriginal.get(stem) || stem;
1518
- if (compoundWordSet.has(original) && !reqTokens.includes(original)) {
1599
+ if (compoundWordSet.has(original) && !allTokens.includes(original)) {
1519
1600
  return false;
1520
1601
  }
1521
1602
  return true;