npm - domain-rank - Versions diffs - 0.1.1 - Mend

domain-rank 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/exports/domain-rank.csv +10021 -0
package/exports/domain-rank.json +70142 -0
package/exports/domain-rank.ndjson +10020 -0
package/package.json +33 -0
package/packages/domain-rank/exports/domain-rank.csv +10021 -0
package/packages/domain-rank/exports/domain-rank.json +70142 -0
package/packages/domain-rank/exports/domain-rank.ndjson +10020 -0
package/readme.md +33 -0
package/src/domain-api.ts +79 -0
package/src/domain-exceptions.ts +24 -0
package/src/domain-name-formatter.ts +136 -0
package/src/duplicates.d.ts +3 -0
package/src/duplicates.js +413 -0
package/src/export.ts +98 -0
package/src/favicons.js +213 -0
package/src/import-domains-1m.js +170 -0
package/src/merge-domain-lists.ts +109 -0
package/src/parse-domain-info.ts +99 -0
package/test/domain.test.js +13 -0
package/test/search.test.js +360 -0
package/tsconfig.json +19 -0
package/vite.config.ts +18 -0

package/readme.md ADDED Viewed

@@ -0,0 +1,33 @@
+### [Live Demo](https://domain-rank.vercel.app)
+### Domain Info From Any URL
+* Domain Name: nytimes.com
+* Source Name: New York Times (from cache for top 1000 or parsing)
+* Influence Rank: 89 (from the [Official Tranco List](https://tranco-list.eu/))
+* Favicon: URL (with Google Favicons API) or base64 string
+### Domain Rank Options:
+* The [Official Tranco List](https://tranco-list.eu/) aggregates multiple ranking providers (Cisco Umbrella, Majestic, Farsight, Chrome UX Report, [Cloudflare Domain Radar](https://radar.cloudflare.com/domains)) to generate manipulation-resistant popularity lists. The list is updated daily (UTC).
+* [CommonCrawl](https://commoncrawl.org) is a nonprofit that publishes an open public dataset built by crawling and downloading the web (about 100TB of URLs and HTML). It shows how trustworthy and influential a domain is, based on links pointing to that domain's pages across all 120+ million domains.
+### Use Cases
+* Bookmark Lists and Web App Launcher
+* Autocomplete for a search engine or URL bar, with typo-tolerant fuzzy matching
+* LLM Chatbot for Web App Recommendations
+### Exporter (CLI)
+Build the exporter with Vite and produce JSON/CSV/NDJSON exports from the bundled data.
+Run locally from the package root:
+```bash
+pnpm run build:export
+pnpm run export -- --formats=json,csv,ndjson --out=./exports
+```
+The command writes `domain-rank.json`, `domain-rank.csv`, and `domain-rank.ndjson` into `./exports` by default.

package/src/domain-api.ts ADDED Viewed

@@ -0,0 +1,79 @@
+import fs from 'fs';
+/** Lookup table from a domain name to its base64-encoded favicon. */
+interface FaviconCache {
+  [domain: string]: string;
+}
+/**
+ * Reduce a URL or bare domain to the host name used for cache and API lookups.
+ * Input that `isURLValid` rejects, or that `URL` cannot parse, is returned unchanged.
+ */
+function extractHostname(urlOrDomain: string): string {
+  if (!isURLValid(urlOrDomain)) return urlOrDomain;
+  // Look for a real scheme prefix instead of `startsWith('http')`: bare hosts
+  // such as "httpbin.org" also begin with "http", and `new URL('httpbin.org')`
+  // throws because the string has no scheme.
+  const hasScheme = /^https?:\/\//.test(urlOrDomain);
+  try {
+    return new URL(hasScheme ? urlOrDomain : 'https://' + urlOrDomain).hostname;
+  } catch {
+    return urlOrDomain;
+  }
+}
+/**
+ * Load `./data/favicons.json` (resolved against the current working directory).
+ * A missing or unreadable file yields an empty cache so callers fall back to
+ * fetching instead of failing.
+ */
+function readFaviconCache(): FaviconCache {
+  try {
+    return JSON.parse(fs.readFileSync('./data/favicons.json', 'utf8')) as FaviconCache;
+  } catch (error) {
+    console.warn('Favicon cache unavailable:', (error as Error).message);
+    return {};
+  }
+}
+/**
+ * Get the favicon for a URL or domain using Google's favicon API.
+ *
+ * @param urlOrDomain Full URL (`https://example.com/page`) or bare domain (`example.com`).
+ * @param formatBase64 When true (default) resolve to base64-encoded image data,
+ *   taken from `./data/favicons.json` when present there and fetched otherwise.
+ *   When false resolve to the favicon API URL without touching the favicon
+ *   cache or the network.
+ * @returns Base64 image data, or the favicon API URL.
+ */
+export async function getFaviconForDomain(
+  urlOrDomain: string,
+  formatBase64: boolean = true
+): Promise<string> {
+  const domain = extractHostname(urlOrDomain);
+  // Best-effort lookup in domain-info.json. A hit is only logged for now; the
+  // result is not used, and any failure here is reported but never fatal.
+  try {
+    const cachePath = './data/domain-info.json';
+    if (fs.existsSync(cachePath)) {
+      const cacheData = JSON.parse(fs.readFileSync(cachePath, 'utf8'));
+      if (cacheData[domain]) {
+        console.log(`Cache hit for domain: ${domain}`);
+      }
+    }
+  } catch (error) {
+    console.warn(`Cache check failed for ${domain}:`, (error as Error).message);
+  }
+  const faviconURL = 'https://www.google.com/s2/favicons?domain=' + encodeURIComponent(domain);
+  if (!formatBase64) return faviconURL;
+  // Only accept a non-empty string so inherited object members are never
+  // mistaken for cached image data.
+  const cached: unknown = readFaviconCache()[domain];
+  if (typeof cached === 'string' && cached.length > 0) return cached;
+  const response = await fetch(faviconURL);
+  return Buffer.from(await response.arrayBuffer()).toString('base64');
+}
+/**
+ * Strip the top-level-domain suffix from a host name using regex heuristics.
+ *
+ * Some suffixes have two or more parts
+ * (https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains), so it is
+ * hard to tell whether `host.secondTLD.tld` or `host.tld` is the right split
+ * (e.g. abc.go.jp, abc.co.uk). Three patterns are tried in order: a list of
+ * well-known suffix labels, a short two-part suffix, and finally "the last
+ * label". Whatever precedes the match is kept, trimmed to its last two labels.
+ *
+ * @param domain Host name such as `en.wikipedia.org`.
+ * @returns The name without its suffix (`en.wikipedia`, `abc` for `abc.co.uk`).
+ *   Input with no recognisable suffix (e.g. `localhost`) has nothing stripped.
+ */
+export function convertURLToDomain(domain: string): string {
+  // Dots must be double-escaped in a string-built RegExp. Unescaped they match
+  // ANY character, so "nature.com" matched "n" + "at" at index 0 and the whole
+  // name was cut away.
+  const tldRegExp = new RegExp(
+    "(?=[^^])\\.(fr|de|cz|at|com|wiki|co|edu|gov|info|mil|id|" +
+    "gv|tv|int|name|net|org|pro|ac|me|ltd|parliament)(\\.|$).*$"
+  );
+  const match =
+    domain.match(tldRegExp) ||
+    // NOTE(review): `[^a-z]{1,2}` only matches a NON-letter second-level label,
+    // so this never fires for suffixes like ".go.jp"; presumably `[a-z]` was
+    // intended. Left unchanged pending confirmation.
+    domain.match(/(?=[^^])\.[^a-z]{1,2}\.[^\.]{2,4}$/) ||
+    domain.match(/\.[^\.]{2,}$/);
+  // Without a match there is no suffix to strip; keep the whole input rather
+  // than handing `null` to substring(), which silently produced "".
+  const suffixStart = match?.index ?? domain.length;
+  // Keep at most the last two labels (sub.name); a single label passes through.
+  return domain.substring(0, suffixStart).split(".").slice(-2).join(".");
+}
+/**
+ * Checks if a string looks like a URL: an optional `http://` / `https://`
+ * scheme, a lowercase host, a 2-6 character alphabetic suffix and an optional
+ * path made of word characters, spaces, dots, dashes and slashes.
+ *
+ * The pattern is deliberately narrow: ports, query strings, fragments,
+ * uppercase hosts and suffixes longer than six characters are rejected.
+ *
+ * @param url Candidate URL or bare domain.
+ * @returns True when the whole string matches the pattern.
+ */
+export function isURLValid(url: string): boolean {
+  // The path used to be `([\/\w \.-]*)*`. A star inside a star accepts exactly
+  // the same strings as a single star but backtracks exponentially on input
+  // that fails near the end, so one long bad URL could hang the process.
+  return /^(?:https?:\/\/)?[\da-z\.-]+\.[a-z\.]{2,6}[\/\w \.-]*\/?$/.test(url);
+}

package/src/domain-exceptions.ts ADDED Viewed

@@ -0,0 +1,24 @@
+/**
+ * Hand-written display names, keyed by domain, for sites whose name cannot be
+ * derived from the domain string itself (acronyms, institutions, agencies).
+ */
+export const domainExceptions: Record<string, string> = {
+  'pepsico.com': 'PepsiCo',
+  'doc.gov': 'U.S. Department of Commerce',
+  'wustl.edu': 'Washington University in St. Louis',
+  'unwomen.org': 'UN Women',
+  'gatech.edu': 'Georgia Tech',
+  'instructure.com': 'Instructure',
+  'usembassy.gov': 'U.S. Embassy',
+  'digitaljournal.com': 'Digital Journal',
+  'ico.org.uk': "Information Commissioner's Office",
+  'colorado.edu': 'University of Colorado',
+  'anu.edu.au': 'Australian National University',
+  'syr.edu': 'Syracuse University',
+  'ucsb.edu': 'UC Santa Barbara',
+  'imperial.ac.uk': 'Imperial College London',
+  'grist.org': 'Grist',
+  'iucn.org': 'IUCN',
+  'corporatefinanceinstitute.com': 'Corporate Finance Institute',
+  'aph.gov.au': 'Australian Parliament House',
+  'reference.com': 'Reference.com',
+  'timeshighereducation.com': 'Times Higher Education',
+  'hopkinsmedicine.org': 'Johns Hopkins Medicine',
+  'ustr.gov': 'United States Trade Representative',
+};

package/src/domain-name-formatter.ts ADDED Viewed

@@ -0,0 +1,136 @@
+import { getDomainWithoutSuffix } from "tldts";
+import { duplicates, removals, titles } from "./duplicates.js";
+import { domainExceptions } from "./domain-exceptions.js";
+/**
+ * Tell whether a domain is on the removal list.
+ *
+ * @param domain Domain to look up; compared exactly against each entry's `main`.
+ * @returns True when some `removals` entry names this domain.
+ */
+export function shouldRemoveDomain(domain: string): boolean {
+  for (const entry of removals) {
+    if (entry.main === domain) return true;
+  }
+  return false;
+}
+/**
+ * Resolve an alternate domain to the main domain it is listed under.
+ *
+ * @param domain Domain to look up among every entry's `alt` list.
+ * @returns The `main` of the first entry listing `domain` as an alternate, or
+ *   null when no entry lists it.
+ */
+export function findMainDomain(domain: string): string | null {
+  const owner = duplicates.find((entry) => entry.alt?.includes(domain));
+  return owner ? owner.main : null;
+}
+/**
+ * Look up a hand-written display title for a domain.
+ *
+ * `titles` is consulted first, then `domainExceptions`.
+ *
+ * @param domain Domain to look up; matched exactly.
+ * @returns The override title, or null when neither table has a non-empty one.
+ */
+export function getTitleOverride(domain: string): string | null {
+  // Only own keys count. A plain `table[domain]` also resolves inherited
+  // members, so "constructor" or "toString" came back as a function.
+  const lookup = (table: Record<string, string>): string | undefined =>
+    Object.prototype.hasOwnProperty.call(table, domain) ? table[domain] : undefined;
+  return lookup(titles) || lookup(domainExceptions) || null;
+}
+/** Short words that stay in ordinary title case instead of being upper-cased as acronyms. */
+const STOP_WORDS = new Set([
+  "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has",
+  "he", "in", "is", "it", "its", "of", "on", "that", "the", "to", "was",
+  "were", "will", "with",
+]);
+/** Domain endings stripped if they are still present after suffix removal. */
+const DOMAIN_ENDINGS_RE = /\.(com|net|org|io|gov|edu|co\.uk)$/i;
+/** Words commonly fused into publisher domains ("dailymail", "nytimes"). */
+const COMMON_WORDS_RE =
+  /(post|the|insider|news|times|daily|weekly|herald|tribune|journal|gazette|press|star|sun|mail|today|now|live|tv|radio|web|net|tech|blog|online|digital|media|corp|inc|ltd|llc)/gi;
+// COMMON_WORDS_RE.source already carries its own capture group. Wrapping it in
+// a second capturing pair shifted the group numbers, so "$1 $2" wrote the word
+// twice and dropped the following letter ("dailymail" -> "daily dailyail").
+// A non-capturing wrapper keeps $1/$2 pointing at the word and the letter.
+const WORD_THEN_LETTER_RE = new RegExp(`(?:${COMMON_WORDS_RE.source})([a-z])`, "gi");
+const LETTER_THEN_WORD_RE = new RegExp(`([a-z])(?:${COMMON_WORDS_RE.source})`, "gi");
+/**
+ * Turn a domain into a human-readable title, e.g. `nytimes.com` -> `NY Times`.
+ *
+ * The suffix is removed with `getDomainWithoutSuffix` (falling back to the raw
+ * input), the remainder is split at case changes, letter/digit boundaries and
+ * common publisher words, then each word is capitalised. Words of up to three
+ * characters that are not stop words are upper-cased as acronyms, and results
+ * shorter than five characters are upper-cased entirely.
+ *
+ * @param domain Domain or host name to format.
+ * @returns The formatted title; may be empty when nothing is left to format.
+ */
+export function formatDomainAsTitle(domain: string): string {
+  let source = getDomainWithoutSuffix(domain) ?? domain;
+  source = source
+    .replace(DOMAIN_ENDINGS_RE, "")
+    .replace(/([a-z])([A-Z])/g, "$1 $2")
+    .replace(/([a-zA-Z])(\d)/g, "$1 $2")
+    .replace(/(\d)([a-zA-Z])/g, "$1 $2")
+    .replace(/([a-z])([A-Z][a-z])/g, "$1 $2")
+    .replace(WORD_THEN_LETTER_RE, "$1 $2")
+    .replace(LETTER_THEN_WORD_RE, "$1 $2")
+    .replace(/\s+/g, " ")
+    .replace(".com", "")
+    // Removes "home" wherever it occurs, not only as a separate word.
+    .replace(/home/gi, "")
+    .trim();
+  source = source
+    .split(" ")
+    .map((word) => {
+      const lower = word.toLowerCase();
+      return word.length <= 3 && !STOP_WORDS.has(lower)
+        ? word.toUpperCase()
+        : word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
+    })
+    .join(" ");
+  // Very short names are treated as acronyms.
+  if (source.replace(/\s/g, "").length < 5) {
+    source = source.toUpperCase();
+  }
+  return source;
+}
+/**
+ * Reduce a raw page title to a short source name.
+ *
+ * When the title contains a separator (` | `, ` - `, ` / `, ` : `, ` » ` or a
+ * bare `|`), the longest segment replaces the title if it is longer than ten
+ * characters. Trailing boilerplate such as " - Home" is then dropped, the text
+ * is capped at 150 characters, HTML tags are removed and whitespace collapsed.
+ *
+ * @param title Raw title text.
+ * @returns The cleaned title, or null when the input or the result is empty.
+ */
+export function cleanSourceTitle(title: string): string | null {
+  if (!title) return null;
+  const separatorRe = /( [|\-\/:»] )|( - )|(\|)/;
+  const trailingBoilerplate = [
+    " - Home", " | Home", " - Official Site", " | Official Site",
+    " - Official Website", " | Official Website", " - Official", " | Official",
+    " - Welcome", " | Welcome", " - Homepage", " | Homepage",
+  ];
+  let text = title.trim();
+  if (separatorRe.test(text)) {
+    // split() with capture groups also yields the separators themselves and
+    // `undefined` for groups that did not take part, hence the `?.`.
+    let longest = "";
+    for (const piece of text.split(separatorRe)) {
+      if ((piece?.length ?? 0) > longest.length) longest = piece;
+    }
+    if (longest.length > 10) text = longest;
+  }
+  // Each suffix is tried once, in list order.
+  text = trailingBoilerplate.reduce(
+    (current, suffix) =>
+      current.endsWith(suffix) ? current.slice(0, -suffix.length) : current,
+    text
+  );
+  text = text
+    .slice(0, 150)
+    .replace(/<\/?[^>]+(>|$)/g, "")
+    .replace(/\s+/g, " ")
+    .trim();
+  return text === "" ? null : text;
+}
+/**
+ * Fetch `https://<domain>` and extract the page's title.
+ *
+ * The `og:title` meta tag is preferred; the `<title>` element is the fallback.
+ * The whole request, including reading the body, is aborted after 5 seconds.
+ *
+ * @param domain Host name to fetch, without a scheme.
+ * @returns The trimmed title, or null when the response is not OK, no title is
+ *   found, or the request fails or times out (failures are logged, not thrown).
+ */
+export async function getSourceTitle(domain: string): Promise<string | null> {
+  const controller = new AbortController();
+  const timeoutId = setTimeout(() => controller.abort(), 5000);
+  try {
+    const response = await fetch(`https://${domain}`, {
+      headers: {
+        "User-Agent":
+          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+      },
+      signal: controller.signal,
+    });
+    if (!response.ok) return null;
+    // The timer stays armed while the body is read, so a stalled body is
+    // aborted as well.
+    const html = await response.text();
+    // Capture up to the SAME quote character that opened the attribute, so a
+    // title such as "Bob's Site" is not cut off at the apostrophe.
+    const ogTitle = html.match(
+      /<meta[^>]*property=["']og:title["'][^>]*content=(["'])([\s\S]+?)\1[^>]*>/i
+    );
+    if (ogTitle) return ogTitle[2].trim();
+    const title = html.match(/<title[^>]*>([^<]+)<\/title>/i);
+    if (title) return title[1].trim();
+    return null;
+  } catch (error) {
+    console.log(
+      `Could not get source title for ${domain}: ${(error as Error).message}`
+    );
+    return null;
+  } finally {
+    // Always release the timer. Previously a rejected fetch left it pending
+    // for the remainder of the 5 seconds.
+    clearTimeout(timeoutId);
+  }
+}

package/src/duplicates.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+/** Display-title overrides keyed by domain. */
+export declare const titles: Record<string, string>;
+/** Domains to drop from the list; `main` is the domain that is matched. */
+export declare const removals: { main: string; remove?: boolean }[];
+/** Groups of alternate domains (`alt`) listed under one main domain (`main`). */
+export declare const duplicates: { main: string; alt?: string[] }[];

package/src/duplicates.js ADDED Viewed

@@ -0,0 +1,413 @@
+/**
+ * Domain name duplicate alternatives found in top domains
+ * Infrastructure tools are to be removed
+ * Alternative domains are to be listed under main.
+ */
+export const titles = {
+  "state.tx.us":"Texas State Government",
+  "ufl.edu": "University of Florida",
+  "admin.ch": " Swiss Government",
+  "tufts.edu": "Tufts University",
+"usgs.gov":  "U.S. Geological Survey",
+  "theglobeandmail.com": "The Globe and Mail",
+  "spiegel.de": "Spiegel German News",
+  "justice.gov": "US Department of Justice",
+  "gov.ru": "Russian Government",
+  "martinfowler.com": "Martin Fowler",
+  "optimizely.com": "Optimizely",
+  "loc.gov": "Library of Congress",
+  "uci.edu": "UC Irvine",
+  "ca.gov": "California Government",
+  "cam.ac.uk": "Cambridge University",
+  "baidu.com": "Baidu",
+  "tandfonline.com": "Taylor & Francis Journals",
+  "ohchr.org": "UN Human Rights Office",
+  "giphy.com": "Giphy",
+  "naver.com": "Naver (네이버)",
+  "upenn.edu": "University of Pennsylvania",
+  "ft.com": "Financial Times",
+  "berkeley.edu": "UC Berkeley",
+  "kickstarter.com": "Kickstarter",
+  "doi.org": "Digital Object Identifier",
+  "stackoverflow.com": "StackOverflow",
+  "businessinsider.com": "Business Insider",
+  "ietf.org":  "Internet Engineering Task Force",
+  "qq.com":"QQ (腾讯网)",
+  "linktr.ee": "Link Tree",
+  "vk.com": "VKontakte",
+  "bit.ly": "Bitly",
+  "europa.eu": "European Union",
+  "newyorker.com": "New Yorker",
+  "ameblo.jp": "Ameba Japanese",
+  "gov.uk": "UK Government",
+  "gov.ru": "Russian Government",
+  "gov.ca": "Canadian Government",
+  "gov.au": "Australian Government",
+  "gov.nz": "New Zealand Government",
+  "gov.in": "Indian Government",
+  "gov.cn": "Chinese Government",
+}
+export const removals  = [
+  { main: "polyfill.io", remove: true },
+  { main: "example.com", remove: true },
+  { main: "example.net", remove: true },
+  { main: "domain.com", remove: true },
+  { main: "jsdelivr.net", remove: true },
+  { main: "gmpg.org", remove: true },
+  { main: "aka.ms", remove: true },
+  { main: "addthis.com", remove: true },
+  { main: "doubleclick.net", remove: true },
+  { main: "netdna-ssl.com", remove: true },
+]
+export const duplicates  = [
+  {
+    main: "godaddy.com",
+    alt: ["wsimg.com"],
+  },
+  {
+    main: "blogspot.com",
+    alt: ["blogger.com", "blogspot.ca", "blogspot.co.uk"],
+  },
+  {
+    main: "microsoft.com",
+    alt: ["windows.net"],
+  },
+  {
+    main: "huffingtonpost.com",
+    alt: ["huffpost.com"],
+  },
+  {
+    main: "sciencemag.org",
+    alt: ["science.org"],
+  },
+  {
+    main: "theguardian.com",
+    alt: ["guardian.co.uk"],
+  },
+  {
+    main: "bootstrap.com",
+    alt: ["bootstrapcdn.com"],
+  },
+  {
+    main: "telegram.me",
+    alt: ["telegram.org", "telegra.ph", "t.me"],
+  },
+  {
+    main: "netlify.com",
+    alt: ["netlify.app"],
+  },
+  {
+    main: "apple.com",
+    alt: ["apple.co"],
+  },
+  {
+    main: "pinterest.com",
+    alt: ["pinterest.co.uk", "pinimg.com"],
+  },
+  {
+    main: "bbc.co.uk",
+    alt: ["bbci.co.uk", "bbc.com"],
+  },
+  {
+    main: "yandex.ru",
+    alt: ["yandex.net"],
+  },
+  {
+    main: "pewresearch.org",
+    alt: ["pewsocialtrends.org"],
+  },
+  {
+    main: "jenkins.io",
+    alt: ["jenkins-ci.org"],
+  },
+  {
+    main: "wix.com",
+    alt: ["wixstatic.com", "wixsite.com"],
+  },
+  {
+    main: "wordpress.com",
+    alt: ["wp.me", "wordpress.org", "wp.com", "w.org", "gravatar.com", "wpengine.com"],
+  },
+  {
+    main: "amazon.com",
+    alt: [
+      "amazon.es",
+        "amazon.in",
+        "amazon.it",
+        "amazon.de",
+        "amazon.fr",
+      "alexa.com",
+      "amazon.cn",
+      "amazon.co.jp",
+      "amazon.com.au",
+      "amazon.com.br",
+      "amazon.com.cn",
+      "amazon.com.hk",
+      "amazon.com.mx",
+      "amazon.com.sg",
+      "amazon.com.tw",
+      "amazon.com.mx",
+      "cloudfront.net",
+      "amzn.to",
+      "a.co",
+      "amazon.fr",
+      "amazon.de",
+      "amazon.co.uk",
+      "amazonaws.com",
+      "media-amazon.com",
+    ],
+  },
+  {
+    main: "mailchimp.com",
+    alt: ["mailchi.mp", "list-manage.com", "eepurl.com"],
+  },
+  {
+    main: "flickr.com",
+    alt: ["staticflickr.com"],
+  },
+  {
+    main: "facebook.com",
+    alt: ["fb.watch", "facebook.net", "fb.me", "fb.com", "fbcdn.net"],
+  },
+  {
+    main: "whatapp.com",
+    alt: ["wa.me"],
+  },
+  {
+    main: "thetimes.com",
+    alt: ["timesonline.co.uk", "thetimes.co.uk"],
+  },
+  {
+    main: "youtube.com",
+    alt: ["youtu.be", "youtube-nocookie.com", "ytimg.com"],
+  },
+  {
+    main: "imgbb.com",
+    alt: ["ibb.com", "ibb.co"],
+  },
+  {
+    main: "x.com",
+    alt: ["twitter.com", "t.co", "twimg.com"],
+  },
+  {
+    main: "npmjs.com",
+    alt: ["unpkg.com"],
+  },
+  {
+    main: "github.com",
+    alt: ["githubusercontent.com", "github.io", "git-scm.com"],
+  },
+  {
+    main: "google.com",
+    alt: [
+      "googleapis.com",
+      "g.page",
+      "business.site",
+      "google-analytics.com",
+      "goo.gl",
+      "withgoogle.com",
+      "ggpht.com",
+      "research.google",
+      "g.co",
+      "translate.goog",
+      "googleblog.com",
+      "blog.google",
+      "forms.gle",
+      "googletagmanager.com",
+      "googleadservices.com",
+      "googleusercontent.com",
+      "googlesyndication.com",
+      "gstatic.com",
+      "google.ad",
+      "google.ae",
+      "google.com.af",
+      "google.com.ag",
+      "google.al",
+      "google.am",
+      "google.co.ao",
+      "google.com.ar",
+      "google.as",
+      "google.at",
+      "google.com.au",
+      "google.az",
+      "google.ba",
+      "google.com.bd",
+      "google.be",
+      "google.bf",
+      "google.bg",
+      "google.com.bh",
+      "google.bi",
+      "google.bj",
+      "google.com.bn",
+      "google.com.bo",
+      "google.com.br",
+      "google.bs",
+      "google.bt",
+      "google.co.bw",
+      "google.by",
+      "google.com.bz",
+      "google.ca",
+      "google.cd",
+      "google.cf",
+      "google.cg",
+      "google.ch",
+      "google.ci",
+      "google.co.ck",
+      "google.cl",
+      "google.cm",
+      "google.cn",
+      "google.com.co",
+      "google.co.cr",
+      "google.com.cu",
+      "google.cv",
+      "google.com.cy",
+      "google.cz",
+      "google.de",
+      "google.dj",
+      "google.dk",
+      "google.dm",
+      "google.com.do",
+      "google.dz",
+      "google.com.ec",
+      "google.ee",
+      "google.com.eg",
+      "google.es",
+      "google.com.et",
+      "google.fi",
+      "google.com.fj",
+      "google.fm",
+      "google.fr",
+      "google.ga",
+      "google.ge",
+      "google.gg",
+      "google.com.gh",
+      "google.com.gi",
+      "google.gl",
+      "google.gm",
+      "google.gr",
+      "google.com.gt",
+      "google.gy",
+      "google.com.hk",
+      "google.hn",
+      "google.hr",
+      "google.ht",
+      "google.hu",
+      "google.co.id",
+      "google.ie",
+      "google.co.il",
+      "google.im",
+      "google.co.in",
+      "google.iq",
+      "google.is",
+      "google.it",
+      "google.je",
+      "google.com.jm",
+      "google.jo",
+      "google.co.jp",
+      "google.co.ke",
+      "google.com.kh",
+      "google.ki",
+      "google.kg",
+      "google.co.kr",
+      "google.com.kw",
+      "google.kz",
+      "google.la",
+      "google.com.lb",
+      "google.li",
+      "google.lk",
+      "google.co.ls",
+      "google.lt",
+      "google.lu",
+      "google.lv",
+      "google.com.ly",
+      "google.co.ma",
+      "google.md",
+      "google.me",
+      "google.mg",
+      "google.mk",
+      "google.ml",
+      "google.com.mm",
+      "google.mn",
+      "google.com.mt",
+      "google.mu",
+      "google.mv",
+      "google.mw",
+      "google.com.mx",
+      "google.com.my",
+      "google.co.mz",
+      "google.com.na",
+      "google.com.ng",
+      "google.com.ni",
+      "google.ne",
+      "google.nl",
+      "google.no",
+      "google.com.np",
+      "google.nr",
+      "google.nu",
+      "google.co.nz",
+      "google.com.om",
+      "google.com.pa",
+      "google.com.pe",
+      "google.com.pg",
+      "google.com.ph",
+      "google.com.pk",
+      "google.pl",
+      "google.pn",
+      "google.com.pr",
+      "google.ps",
+      "google.pt",
+      "google.com.py",
+      "google.com.qa",
+      "google.ro",
+      "google.ru",
+      "google.rw",
+      "google.com.sa",
+      "google.com.sb",
+      "google.sc",
+      "google.se",
+      "google.com.sg",
+      "google.sh",
+      "google.si",
+      "google.sk",
+      "google.com.sl",
+      "google.sn",
+      "google.so",
+      "google.sm",
+      "google.sr",
+      "google.st",
+      "google.com.sv",
+      "google.td",
+      "google.tg",
+      "google.co.th",
+      "google.com.tj",
+      "google.tl",
+      "google.tm",
+      "google.tn",
+      "google.to",
+      "google.com.tr",
+      "google.tt",
+      "googlesource.com",
+      "google.com.tw",
+      "google.co.tz",
+      "google.com.ua",
+      "google.co.ug",
+      "google.co.uk",
+      "google.com.uy",
+      "google.co.uz",
+      "google.com.vc",
+      "google.co.ve",
+      "google.co.vi",
+      "google.com.vn",
+      "google.vu",
+      "google.ws",
+      "google.rs",
+      "google.co.za",
+      "google.co.zm",
+      "google.co.zw",
+      "google.cat",
+    ],
+  },
+];