gatsby-source-notion-churnotion 1.0.66 → 1.0.68

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,3 +2,4 @@ export type { IPluginOptions } from "./types";
2
2
  export { onPluginInit } from "./onPluginInit";
3
3
  export { sourceNodes } from "./source-nodes";
4
4
  export { createSchemaCustomization } from "./createSchemaCustomization";
5
+ export { onPostBootstrap } from "./onPostBootstrap";
@@ -1,9 +1,11 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.createSchemaCustomization = exports.sourceNodes = exports.onPluginInit = void 0;
3
+ exports.onPostBootstrap = exports.createSchemaCustomization = exports.sourceNodes = exports.onPluginInit = void 0;
4
4
  var onPluginInit_1 = require("./onPluginInit");
5
5
  Object.defineProperty(exports, "onPluginInit", { enumerable: true, get: function () { return onPluginInit_1.onPluginInit; } });
6
6
  var source_nodes_1 = require("./source-nodes");
7
7
  Object.defineProperty(exports, "sourceNodes", { enumerable: true, get: function () { return source_nodes_1.sourceNodes; } });
8
8
  var createSchemaCustomization_1 = require("./createSchemaCustomization");
9
9
  Object.defineProperty(exports, "createSchemaCustomization", { enumerable: true, get: function () { return createSchemaCustomization_1.createSchemaCustomization; } });
10
+ var onPostBootstrap_1 = require("./onPostBootstrap");
11
+ Object.defineProperty(exports, "onPostBootstrap", { enumerable: true, get: function () { return onPostBootstrap_1.onPostBootstrap; } });
@@ -2,6 +2,6 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.onPluginInit = void 0;
4
4
  const onPluginInit = ({ reporter }) => {
5
- reporter.info(`Churnotion plugin loaded...`);
5
+ reporter.info(`Churnotion plugin loaded`);
6
6
  };
7
7
  exports.onPluginInit = onPluginInit;
@@ -0,0 +1,2 @@
1
+ import { GatsbyNode } from "gatsby";
2
+ export declare const onPostBootstrap: GatsbyNode[`onPostBootstrap`];
@@ -1,113 +1,135 @@
1
1
  "use strict";
2
- // import { GatsbyNode } from "gatsby";
3
- // import { TfIdf, TfIdfTerm } from "natural";
4
- // import { NODE_TYPE } from "./constants";
5
- // import crypto from "crypto";
6
- // const md5 = (str: string): string => {
7
- // const md5 = crypto.createHash("md5");
8
- // return md5.update(str, "binary").digest("hex");
9
- // };
10
- // const getSpaceSeparatedDoc: {
11
- // [key: string]: (doc: string) => Promise<string[]>;
12
- // } = {
13
- // en: async (doc) => {
14
- // return doc.toLowerCase().split(' ');
15
- // },
16
- // ja: async (doc) => {
17
- // if (kuromoji_tokenizer === null)
18
- // kuromoji_tokenizer = await getKuromojiTokenizer();
19
- // return kuromoji_tokenizer
20
- // .tokenize(doc)
21
- // .filter(
22
- // (x) =>
23
- // x.pos === '名詞' &&
24
- // ['一般', '固有名詞'].indexOf(x.pos_detail_1) !== -1
25
- // )
26
- // .map((x) => (x.basic_form !== '*' ? x.basic_form : x.surface_form));
27
- // },
28
- // };
29
- // export const onPostBootstrap: GatsbyNode["onPostBootstrap"] = async ({
30
- // actions,
31
- // getNode,
32
- // getNodesByType,
33
- // createNodeId,
34
- // reporter,
35
- // cache,
36
- // }) => {
37
- // const nodes = getNodesByType(NODE_TYPE.Post);
38
- // const docs: Record<string, string>[] = nodes.map((node) => ({
39
- // id: node.id,
40
- // text: node.rawText as string,
41
- // }));
42
- // const tfidf = new TfIdf();
43
- // for (let doc of docs) {
44
- // const key = `${md5(doc.text)}-related-post`;
45
- // const cached_ssd = await cache.get(key);
46
- // if (cached_ssd !== undefined) {
47
- // tfidf.addDocument(cached_ssd);
48
- // continue;
49
- // }
50
- // const ssd = await getSpaceSeparatedDoc[option.doc_lang](
51
- // getTextFromMarkdown(doc.text)
52
- // );
53
- // tfidf.addDocument(ssd);
54
- // await cache.set(key, ssd);
55
- // }
56
- // // generate bow vectors
57
- // type Term = TfIdfTerm & {
58
- // tf: number;
59
- // idf: number;
60
- // };
61
- // //// extract keywords from each document
62
- // const doc_terms = docs.map((_, i) =>
63
- // (tfidf.listTerms(i) as Term[])
64
- // .map((x) => ({ ...x, tfidf: (x as Term).tf * (x as Term).idf }))
65
- // .sort((x, y) => y.tfidf - x.tfidf)
66
- // );
67
- // // DEBUG: print terms
68
- // // doc_terms.forEach((x, i) =>
69
- // // console.log(
70
- // // docs[i].id,
71
- // // x.map((x) => x.term)
72
- // // )
73
- // //);
74
- // const all_keywords = new Set<string>();
75
- // const tfidf_map_for_each_doc: Map<string, number>[] = [];
76
- // doc_terms.forEach((x, i) => {
77
- // tfidf_map_for_each_doc[i] = new Map<string, number>();
78
- // x.slice(0, option.each_bow_size).forEach((x) => {
79
- // all_keywords.add(x.term);
80
- // tfidf_map_for_each_doc[i].set(x.term, x.tfidf);
81
- // });
82
- // });
83
- // //// generate vectors
84
- // const bow_vectors = new Map<string, BowVector>();
85
- // docs.forEach((x, i) => {
86
- // if (bow_vectors === null) return;
87
- // bow_vectors.set(
88
- // x.id,
89
- // Array.from(all_keywords)
90
- // .map((x) => tfidf_map_for_each_doc[i].get(x))
91
- // .map((x) => (x === undefined ? 0 : x))
92
- // );
93
- // });
94
- // reporter.info(
95
- // `[related-posts] bow vectors generated, dimention: ${all_keywords.size}`
96
- // );
97
- // // create related nodes
98
- // nodes.forEach((node) => {
99
- // const related_nodes = getRelatedPosts(node.id, bow_vectors)
100
- // .slice(1)
101
- // .map((id) => getNode(id));
102
- // const digest = `${node.id} >>> related${option.target_node}s`;
103
- // actions.createNode({
104
- // id: createNodeId(digest),
105
- // parent: node.id,
106
- // internal: {
107
- // type: `related${option.target_node}s`,
108
- // contentDigest: digest,
109
- // },
110
- // posts: related_nodes,
111
- // });
112
- // });
113
- // };
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.onPostBootstrap = void 0;
7
+ const crypto_1 = __importDefault(require("crypto"));
8
+ const kiwi_nlp_1 = require("kiwi-nlp");
9
+ const natural_1 = require("natural");
10
+ const constants_1 = require("./constants");
11
+ const computeCosineSimilarity = require("compute-cosine-similarity");
12
+ const vector_similarity_memo = new Map();
13
+ const md5 = (str) => {
14
+ const md5 = crypto_1.default.createHash("md5");
15
+ return md5.update(str, "binary").digest("hex");
16
+ };
17
+ const getMemorizedVectorSimilarity = (v1, v2) => {
18
+ const id = v1.id < v2.id ? `${v1.id} ${v2.id}` : `${v2.id} ${v1.id}`;
19
+ const memorized_similarity = vector_similarity_memo.get(id);
20
+ if (memorized_similarity !== undefined)
21
+ return memorized_similarity;
22
+ const similarity = calcVectorSimilarity(v1.vector, v2.vector);
23
+ vector_similarity_memo.set(id, similarity);
24
+ return similarity;
25
+ };
26
+ const calcVectorSimilarity = (v1, v2) => {
27
+ if (v1.length !== v2.length)
28
+ throw new Error("Both vector's size must be equal");
29
+ return computeCosineSimilarity(v1, v2);
30
+ };
31
+ const getRelatedPosts = (id, bow_vectors) => {
32
+ const vector = bow_vectors.get(id);
33
+ if (vector === undefined)
34
+ return [];
35
+ const vector_node = {
36
+ id,
37
+ vector,
38
+ };
39
+ return Array.from(bow_vectors.entries())
40
+ .sort((x, y) => {
41
+ const vector_x = {
42
+ id: x[0],
43
+ vector: x[1],
44
+ };
45
+ const vector_y = {
46
+ id: y[0],
47
+ vector: y[1],
48
+ };
49
+ return (getMemorizedVectorSimilarity(vector_y, vector_node) -
50
+ getMemorizedVectorSimilarity(vector_x, vector_node));
51
+ })
52
+ .map((x) => x[0]);
53
+ };
54
+ const getTextFromRawText = async (doc) => {
55
+ return doc
56
+ .replace(/http[^ ]+/g, "")
57
+ .replace(/[\#\!\(\)\*\_\[\]\|\=\>\+\`\:\-]/g, "");
58
+ };
59
+ const getSpaceSeparatedDoc = async (doc, kiwi) => {
60
+ return kiwi.tokenize(doc).map((tokenInfo) => tokenInfo.str);
61
+ };
62
+ const onPostBootstrap = async ({ actions, getNode, getNodesByType, createNodeId, reporter, cache, }) => {
63
+ const builder = await kiwi_nlp_1.KiwiBuilder.create("/dist/kiwi-wasm.wasm");
64
+ const kiwi = await builder.build({
65
+ modelFiles: {
66
+ "combiningRule.txt": "/dist/model/combiningRule.txt",
67
+ "default.dict": "/dist/model/default.dict",
68
+ "extract.mdl": "/dist/model/extract.mdl",
69
+ "multi.dict": "/dist/model/multi.dict",
70
+ "sj.knlm": "/dist/model/sj.knlm",
71
+ "sj.morph": "/dist/model/sj.morph",
72
+ "skipbigram.mdl": "/dist/model/skipbigram.mdl",
73
+ "typo.dict": "/dist/model/typo.dict",
74
+ },
75
+ });
76
+ const nodes = getNodesByType(constants_1.NODE_TYPE.Post);
77
+ const docs = nodes.map((node) => ({
78
+ id: node.id,
79
+ text: node.rawText,
80
+ }));
81
+ const tfidf = new natural_1.TfIdf();
82
+ // tfidf
83
+ docs.map(async (doc) => {
84
+ if (doc.text) {
85
+ const key = `${md5(doc.text)}-doc`;
86
+ const cached_ssd = await cache.get(key);
87
+ if (cached_ssd !== undefined) {
88
+ tfidf.addDocument(cached_ssd);
89
+ }
90
+ else {
91
+ const ssd = await getSpaceSeparatedDoc(await getTextFromRawText(doc.text), kiwi);
92
+ tfidf.addDocument(ssd);
93
+ await cache.set(key, ssd);
94
+ }
95
+ }
96
+ });
97
+ //
98
+ const doc_terms = docs.map((_, i) => tfidf.listTerms(i)
99
+ .map((x) => ({ ...x, tfidf: x.tf * x.idf }))
100
+ .sort((x, y) => y.tfidf - x.tfidf));
101
+ const all_keywords = new Set();
102
+ const tfidf_map_for_each_doc = [];
103
+ doc_terms.forEach((x, i) => {
104
+ tfidf_map_for_each_doc[i] = new Map();
105
+ x.slice(0, 30).forEach((x) => {
106
+ all_keywords.add(x.term);
107
+ tfidf_map_for_each_doc[i].set(x.term, x.tfidf);
108
+ });
109
+ });
110
+ const bow_vectors = new Map();
111
+ docs.forEach((x, i) => {
112
+ if (bow_vectors === null)
113
+ return;
114
+ bow_vectors.set(x.id, Array.from(all_keywords)
115
+ .map((x) => tfidf_map_for_each_doc[i].get(x))
116
+ .map((x) => (x === undefined ? 0 : x)));
117
+ });
118
+ reporter.info(`[related-posts] bow vectors generated, dimention: ${all_keywords.size}`);
119
+ nodes.forEach((node) => {
120
+ const related_nodes = getRelatedPosts(node.id, bow_vectors)
121
+ .slice(1)
122
+ .map((id) => getNode(id));
123
+ const digest = `${node.id} - ${constants_1.NODE_TYPE.RelatedPost}`;
124
+ actions.createNode({
125
+ id: createNodeId(digest),
126
+ parent: node.id,
127
+ internal: {
128
+ type: `related${constants_1.NODE_TYPE.RelatedPost}s`,
129
+ contentDigest: digest,
130
+ },
131
+ posts: related_nodes,
132
+ });
133
+ });
134
+ };
135
+ exports.onPostBootstrap = onPostBootstrap;
@@ -40,12 +40,18 @@ const isTextContentBlock = (block) => {
40
40
  "quote",
41
41
  "bulleted_list_item",
42
42
  "numbered_list_item",
43
+ "callout",
44
+ "code",
43
45
  ].includes(block.type);
44
46
  };
45
47
  const extractPlainText = (block) => {
46
48
  if (isTextContentBlock(block)) {
47
49
  const richTextArray = block[block.type]?.rich_text || [];
48
- return richTextArray.map((text) => text.plain_text).join(" ");
50
+ return richTextArray
51
+ .map((text) => block.type === "code" // code의 \n 제거
52
+ ? text.plain_text.replace(/\\n/g, "")
53
+ : text.plain_text)
54
+ .join(" ");
49
55
  }
50
56
  return null;
51
57
  };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "gatsby-source-notion-churnotion",
3
3
  "description": "Gatsby plugin that can connect with One Notion Database RECURSIVELY using official API",
4
- "version": "1.0.66",
4
+ "version": "1.0.68",
5
5
  "skipLibCheck": true,
6
6
  "license": "0BSD",
7
7
  "main": "./dist/gatsby-node.js",
@@ -36,10 +36,12 @@
36
36
  "@notionhq/client": "^2.2.15",
37
37
  "@types/node": "^22.10.2",
38
38
  "axios": "^1.7.9",
39
+ "compute-cosine-similarity": "^1.1.0",
39
40
  "gatsby-plugin-sharp": "^5.14.0",
40
41
  "gatsby-source-filesystem": "^5.14.0",
41
42
  "gatsby-transformer-json": "^5.14.0",
42
43
  "gatsby-transformer-sharp": "^5.14.0",
44
+ "kiwi-nlp": "^0.20.3",
43
45
  "metascraper": "^5.45.25",
44
46
  "metascraper-description": "^5.45.25",
45
47
  "metascraper-image": "^5.45.27",
File without changes
@@ -1 +0,0 @@
1
- "use strict";