gatsby-source-notion-churnotion 1.0.67 → 1.0.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,3 +2,4 @@ export type { IPluginOptions } from "./types";
2
2
  export { onPluginInit } from "./onPluginInit";
3
3
  export { sourceNodes } from "./source-nodes";
4
4
  export { createSchemaCustomization } from "./createSchemaCustomization";
5
+ export { onPostBootstrap } from "./onPostBootstrap";
@@ -1,9 +1,11 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.createSchemaCustomization = exports.sourceNodes = exports.onPluginInit = void 0;
3
+ exports.onPostBootstrap = exports.createSchemaCustomization = exports.sourceNodes = exports.onPluginInit = void 0;
4
4
  var onPluginInit_1 = require("./onPluginInit");
5
5
  Object.defineProperty(exports, "onPluginInit", { enumerable: true, get: function () { return onPluginInit_1.onPluginInit; } });
6
6
  var source_nodes_1 = require("./source-nodes");
7
7
  Object.defineProperty(exports, "sourceNodes", { enumerable: true, get: function () { return source_nodes_1.sourceNodes; } });
8
8
  var createSchemaCustomization_1 = require("./createSchemaCustomization");
9
9
  Object.defineProperty(exports, "createSchemaCustomization", { enumerable: true, get: function () { return createSchemaCustomization_1.createSchemaCustomization; } });
10
+ var onPostBootstrap_1 = require("./onPostBootstrap");
11
+ Object.defineProperty(exports, "onPostBootstrap", { enumerable: true, get: function () { return onPostBootstrap_1.onPostBootstrap; } });
@@ -0,0 +1,2 @@
1
+ import { GatsbyNode } from "gatsby";
2
+ export declare const onPostBootstrap: GatsbyNode[`onPostBootstrap`];
@@ -1,113 +1,135 @@
1
1
  "use strict";
2
- // import { GatsbyNode } from "gatsby";
3
- // import { TfIdf, TfIdfTerm } from "natural";
4
- // import { NODE_TYPE } from "./constants";
5
- // import crypto from "crypto";
6
- // const md5 = (str: string): string => {
7
- // const md5 = crypto.createHash("md5");
8
- // return md5.update(str, "binary").digest("hex");
9
- // };
10
- // const getSpaceSeparatedDoc: {
11
- // [key: string]: (doc: string) => Promise<string[]>;
12
- // } = {
13
- // en: async (doc) => {
14
- // return doc.toLowerCase().split(' ');
15
- // },
16
- // ja: async (doc) => {
17
- // if (kuromoji_tokenizer === null)
18
- // kuromoji_tokenizer = await getKuromojiTokenizer();
19
- // return kuromoji_tokenizer
20
- // .tokenize(doc)
21
- // .filter(
22
- // (x) =>
23
- // x.pos === '名詞' &&
24
- // ['一般', '固有名詞'].indexOf(x.pos_detail_1) !== -1
25
- // )
26
- // .map((x) => (x.basic_form !== '*' ? x.basic_form : x.surface_form));
27
- // },
28
- // };
29
- // export const onPostBootstrap: GatsbyNode["onPostBootstrap"] = async ({
30
- // actions,
31
- // getNode,
32
- // getNodesByType,
33
- // createNodeId,
34
- // reporter,
35
- // cache,
36
- // }) => {
37
- // const nodes = getNodesByType(NODE_TYPE.Post);
38
- // const docs: Record<string, string>[] = nodes.map((node) => ({
39
- // id: node.id,
40
- // text: node.rawText as string,
41
- // }));
42
- // const tfidf = new TfIdf();
43
- // for (let doc of docs) {
44
- // const key = `${md5(doc.text)}-related-post`;
45
- // const cached_ssd = await cache.get(key);
46
- // if (cached_ssd !== undefined) {
47
- // tfidf.addDocument(cached_ssd);
48
- // continue;
49
- // }
50
- // const ssd = await getSpaceSeparatedDoc[option.doc_lang](
51
- // getTextFromMarkdown(doc.text)
52
- // );
53
- // tfidf.addDocument(ssd);
54
- // await cache.set(key, ssd);
55
- // }
56
- // // generate bow vectors
57
- // type Term = TfIdfTerm & {
58
- // tf: number;
59
- // idf: number;
60
- // };
61
- // //// extract keywords from each document
62
- // const doc_terms = docs.map((_, i) =>
63
- // (tfidf.listTerms(i) as Term[])
64
- // .map((x) => ({ ...x, tfidf: (x as Term).tf * (x as Term).idf }))
65
- // .sort((x, y) => y.tfidf - x.tfidf)
66
- // );
67
- // // DEBUG: print terms
68
- // // doc_terms.forEach((x, i) =>
69
- // // console.log(
70
- // // docs[i].id,
71
- // // x.map((x) => x.term)
72
- // // )
73
- // //);
74
- // const all_keywords = new Set<string>();
75
- // const tfidf_map_for_each_doc: Map<string, number>[] = [];
76
- // doc_terms.forEach((x, i) => {
77
- // tfidf_map_for_each_doc[i] = new Map<string, number>();
78
- // x.slice(0, option.each_bow_size).forEach((x) => {
79
- // all_keywords.add(x.term);
80
- // tfidf_map_for_each_doc[i].set(x.term, x.tfidf);
81
- // });
82
- // });
83
- // //// generate vectors
84
- // const bow_vectors = new Map<string, BowVector>();
85
- // docs.forEach((x, i) => {
86
- // if (bow_vectors === null) return;
87
- // bow_vectors.set(
88
- // x.id,
89
- // Array.from(all_keywords)
90
- // .map((x) => tfidf_map_for_each_doc[i].get(x))
91
- // .map((x) => (x === undefined ? 0 : x))
92
- // );
93
- // });
94
- // reporter.info(
95
- // `[related-posts] bow vectors generated, dimention: ${all_keywords.size}`
96
- // );
97
- // // create related nodes
98
- // nodes.forEach((node) => {
99
- // const related_nodes = getRelatedPosts(node.id, bow_vectors)
100
- // .slice(1)
101
- // .map((id) => getNode(id));
102
- // const digest = `${node.id} >>> related${option.target_node}s`;
103
- // actions.createNode({
104
- // id: createNodeId(digest),
105
- // parent: node.id,
106
- // internal: {
107
- // type: `related${option.target_node}s`,
108
- // contentDigest: digest,
109
- // },
110
- // posts: related_nodes,
111
- // });
112
- // });
113
- // };
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.onPostBootstrap = void 0;
7
+ const crypto_1 = __importDefault(require("crypto"));
8
+ const kiwi_nlp_1 = require("kiwi-nlp");
9
+ const natural_1 = require("natural");
10
+ const constants_1 = require("./constants");
11
+ const computeCosineSimilarity = require("compute-cosine-similarity");
12
+ const vector_similarity_memo = new Map();
13
/**
 * Returns the hex-encoded MD5 digest of `str`.
 *
 * The input is hashed as UTF-8. The previous "binary" (latin1) input
 * encoding keeps only the low byte of every UTF-16 code unit, so different
 * non-Latin strings (for example Korean text) could hash to the same value.
 * Pure ASCII input produces the same digest as before.
 *
 * @param str Text to hash.
 * @returns 32-character lowercase hexadecimal digest.
 */
const md5 = (str) => {
    const hasher = crypto_1.default.createHash("md5");
    return hasher.update(str, "utf8").digest("hex");
};
17
/**
 * Returns the similarity of two `{ id, vector }` records, caching the result
 * in `vector_similarity_memo`.
 *
 * The cache key is built from both ids in sorted order, so the pair (a, b)
 * and the pair (b, a) share a single entry.
 *
 * @param v1 First record (`id` and `vector`).
 * @param v2 Second record (`id` and `vector`).
 * @returns The cached or freshly computed similarity.
 */
const getMemorizedVectorSimilarity = (v1, v2) => {
    const [lowId, highId] = v1.id < v2.id ? [v1.id, v2.id] : [v2.id, v1.id];
    const memoKey = `${lowId} ${highId}`;
    const cached = vector_similarity_memo.get(memoKey);
    if (cached === undefined) {
        const computed = calcVectorSimilarity(v1.vector, v2.vector);
        vector_similarity_memo.set(memoKey, computed);
        return computed;
    }
    return cached;
};
26
/**
 * Computes the cosine similarity of two equally sized numeric vectors by
 * delegating to `computeCosineSimilarity`.
 *
 * @param v1 First vector.
 * @param v2 Second vector.
 * @returns Whatever `computeCosineSimilarity` returns for the pair.
 * @throws Error when the two vectors differ in length.
 */
const calcVectorSimilarity = (v1, v2) => {
    const sameSize = v1.length === v2.length;
    if (!sameSize) {
        throw new Error("Both vector's size must be equal");
    }
    const similarity = computeCosineSimilarity(v1, v2);
    return similarity;
};
31
/**
 * Ranks every id in `bow_vectors` by similarity to the vector stored for `id`.
 *
 * @param id Id of the reference post.
 * @param bow_vectors Map from post id to its bag-of-words vector.
 * @returns All ids of the map ordered from most to least similar, or an
 *          empty array when `id` has no vector in the map.
 */
const getRelatedPosts = (id, bow_vectors) => {
    const vector = bow_vectors.get(id);
    if (vector === undefined) {
        return [];
    }
    const target = { id, vector };
    // Similarity between one map entry and the reference post (memoized).
    const similarityToTarget = ([entryId, entryVector]) => getMemorizedVectorSimilarity({ id: entryId, vector: entryVector }, target);
    const ranked = [...bow_vectors.entries()];
    // Descending order: the higher similarity comes first.
    ranked.sort((a, b) => similarityToTarget(b) - similarityToTarget(a));
    return ranked.map(([postId]) => postId);
};
54
/**
 * Strips URLs and common markup punctuation from a post's raw text before it
 * is tokenized.
 *
 * URLs are matched as `http://` or `https://` followed by any run of
 * non-whitespace characters. The previous pattern (`http[^ ]+`) stopped only
 * at a space, so a URL followed by a newline or tab also removed the next
 * word, and ordinary words starting with "http" were deleted as well.
 *
 * @param doc Raw text of a post.
 * @returns Promise resolving to the cleaned text.
 */
const getTextFromRawText = async (doc) => {
    const withoutUrls = doc.replace(/https?:\/\/\S+/g, "");
    // Drop markup characters: # ! ( ) * _ [ ] | = > + ` : -
    return withoutUrls.replace(/[\#\!\(\)\*\_\[\]\|\=\>\+\`\:\-]/g, "");
};
59
/**
 * Tokenizes `doc` with the given Kiwi instance and keeps only the `str`
 * field of every token.
 *
 * @param doc Text to tokenize.
 * @param kiwi Object exposing `tokenize(doc)`.
 * @returns Promise resolving to the list of token strings.
 */
const getSpaceSeparatedDoc = async (doc, kiwi) => {
    const tokens = kiwi.tokenize(doc);
    return tokens.map(({ str }) => str);
};
62
/**
 * Gatsby `onPostBootstrap` hook that creates one related-posts node per post.
 *
 * Steps:
 *  1. Build a Kiwi tokenizer from the bundled wasm and model files.
 *  2. Tokenize the `rawText` of every post node (token lists are cached under
 *     a key derived from the MD5 of the text) and feed them to a TF-IDF index.
 *  3. Keep the 30 highest-scoring terms of each post and build one
 *     bag-of-words vector per post over the union of those terms.
 *  4. For every post node, create a child node whose `posts` field lists the
 *     other posts ordered by vector similarity.
 *
 * Fixes over the previous implementation:
 *  - The tokenizing callbacks are now awaited; before, the TF-IDF index was
 *    read while those callbacks were still pending, so it was still empty.
 *  - TF-IDF document indices are tracked in `indexed_ids`; before, a post
 *    without text shifted the indices of all following posts.
 *  - The post itself is excluded from its related list by id instead of by
 *    assuming it is always the first entry of the ranking.
 *
 * Posts without text (or without any token) get no vector and therefore an
 * empty `posts` list.
 */
const onPostBootstrap = async ({ actions, getNode, getNodesByType, createNodeId, reporter, cache, }) => {
    // Number of top TF-IDF terms kept per post.
    const EACH_BOW_SIZE = 30;
    const builder = await kiwi_nlp_1.KiwiBuilder.create("/dist/kiwi-wasm.wasm");
    const kiwi = await builder.build({
        modelFiles: {
            "combiningRule.txt": "/dist/model/combiningRule.txt",
            "default.dict": "/dist/model/default.dict",
            "extract.mdl": "/dist/model/extract.mdl",
            "multi.dict": "/dist/model/multi.dict",
            "sj.knlm": "/dist/model/sj.knlm",
            "sj.morph": "/dist/model/sj.morph",
            "skipbigram.mdl": "/dist/model/skipbigram.mdl",
            "typo.dict": "/dist/model/typo.dict",
        },
    });
    const nodes = getNodesByType(constants_1.NODE_TYPE.Post);
    const docs = nodes.map((node) => ({
        id: node.id,
        text: node.rawText,
    }));
    // Resolve the token list of every post (from cache when available).
    // Promise.all keeps the results in the same order as `docs`.
    const token_lists = await Promise.all(docs.map(async (doc) => {
        if (!doc.text)
            return undefined;
        const key = `${md5(doc.text)}-doc`;
        const cached_ssd = await cache.get(key);
        if (cached_ssd !== undefined)
            return cached_ssd;
        const ssd = await getSpaceSeparatedDoc(await getTextFromRawText(doc.text), kiwi);
        await cache.set(key, ssd);
        return ssd;
    }));
    // Add documents in a deterministic order and remember which post each
    // TF-IDF document index belongs to.
    const tfidf = new natural_1.TfIdf();
    const indexed_ids = [];
    docs.forEach((doc, i) => {
        const ssd = token_lists[i];
        if (ssd == null || ssd.length === 0)
            return;
        tfidf.addDocument(ssd);
        indexed_ids.push(doc.id);
    });
    // Terms of each indexed post, highest tf * idf first.
    const doc_terms = indexed_ids.map((_, i) => tfidf.listTerms(i)
        .map((x) => ({ ...x, tfidf: x.tf * x.idf }))
        .sort((x, y) => y.tfidf - x.tfidf));
    const all_keywords = new Set();
    const tfidf_map_for_each_doc = doc_terms.map((terms) => {
        const scores = new Map();
        terms.slice(0, EACH_BOW_SIZE).forEach((term) => {
            all_keywords.add(term.term);
            scores.set(term.term, term.tfidf);
        });
        return scores;
    });
    // One vector per indexed post; a keyword missing from a post scores 0.
    const keyword_list = Array.from(all_keywords);
    const bow_vectors = new Map();
    indexed_ids.forEach((id, i) => {
        const scores = tfidf_map_for_each_doc[i];
        bow_vectors.set(id, keyword_list.map((keyword) => {
            const score = scores.get(keyword);
            return score === undefined ? 0 : score;
        }));
    });
    reporter.info(`[related-posts] bow vectors generated, dimention: ${all_keywords.size}`);
    nodes.forEach((node) => {
        const related_nodes = getRelatedPosts(node.id, bow_vectors)
            .filter((id) => id !== node.id)
            .map((id) => getNode(id));
        const digest = `${node.id} - ${constants_1.NODE_TYPE.RelatedPost}`;
        actions.createNode({
            id: createNodeId(digest),
            parent: node.id,
            internal: {
                type: `related${constants_1.NODE_TYPE.RelatedPost}s`,
                contentDigest: digest,
            },
            posts: related_nodes,
        });
    });
};
135
+ exports.onPostBootstrap = onPostBootstrap;
@@ -47,7 +47,11 @@ const isTextContentBlock = (block) => {
47
47
  const extractPlainText = (block) => {
48
48
  if (isTextContentBlock(block)) {
49
49
  const richTextArray = block[block.type]?.rich_text || [];
50
- return richTextArray.map((text) => text.plain_text).join(" ");
50
+ return richTextArray
51
+ .map((text) => block.type === "code" // code의 \n 제거
52
+ ? text.plain_text.replace(/\\n/g, "")
53
+ : text.plain_text)
54
+ .join(" ");
51
55
  }
52
56
  return null;
53
57
  };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "gatsby-source-notion-churnotion",
3
3
  "description": "Gatsby plugin that can connect with One Notion Database RECURSIVELY using official API",
4
- "version": "1.0.67",
4
+ "version": "1.0.68",
5
5
  "skipLibCheck": true,
6
6
  "license": "0BSD",
7
7
  "main": "./dist/gatsby-node.js",
@@ -36,6 +36,7 @@
36
36
  "@notionhq/client": "^2.2.15",
37
37
  "@types/node": "^22.10.2",
38
38
  "axios": "^1.7.9",
39
+ "compute-cosine-similarity": "^1.1.0",
39
40
  "gatsby-plugin-sharp": "^5.14.0",
40
41
  "gatsby-source-filesystem": "^5.14.0",
41
42
  "gatsby-transformer-json": "^5.14.0",
@@ -1,2 +0,0 @@
1
- import { GatsbyNode } from "gatsby";
2
- export declare const onPostBootstrap: GatsbyNode[`onPostBootstrap`];
@@ -1,34 +0,0 @@
1
- "use strict";
2
- var __importDefault = (this && this.__importDefault) || function (mod) {
3
- return (mod && mod.__esModule) ? mod : { "default": mod };
4
- };
5
- Object.defineProperty(exports, "__esModule", { value: true });
6
- exports.onPostBootstrap = void 0;
7
- const crypto_1 = __importDefault(require("crypto"));
8
- const natural_1 = require("natural");
9
- const constants_1 = require("../constants");
10
- const md5 = (str) => {
11
- const md5 = crypto_1.default.createHash("md5");
12
- return md5.update(str, "binary").digest("hex");
13
- };
14
- const getSpaceSeparatedDoc = async (doc) => { };
15
- const onPostBootstrap = async ({ actions, getNode, getNodesByType, createNodeId, reporter, cache, }) => {
16
- const nodes = getNodesByType(constants_1.NODE_TYPE.Post);
17
- const docs = nodes.map((node) => ({
18
- id: node.id,
19
- text: node.rawText,
20
- }));
21
- const tfidf = new natural_1.TfIdf();
22
- docs.map(async (doc) => {
23
- if (doc.text) {
24
- const key = `${md5(doc.text)}-doc`;
25
- const cached_ssd = await cache.get(key);
26
- if (cached_ssd !== undefined) {
27
- tfidf.addDocument(cached_ssd);
28
- }
29
- else {
30
- }
31
- }
32
- });
33
- };
34
- exports.onPostBootstrap = onPostBootstrap;