venus-pit 1.1.6 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -5
- package/TODO.md +1 -2
- package/dist/venus.bundle.js +28 -35
- package/dist/venus.bundle.js.map +1 -1
- package/docs/overview.md +4 -2
- package/docs/src/lib/tarpit/corpora.js.md +3 -0
- package/docs/src/lib/tarpit/markov.js.md +5 -0
- package/docs/{tar.md → src/lib/tarpit/tar.js.md} +5 -0
- package/docs/src/lib/tarpit/word2vec/graph.js.md +2 -0
- package/docs/src/lib/tarpit/word2vec/train.js.md +6 -0
- package/docs/src/lib/tarpit/words/randomWord.js.md +1 -0
- package/docs/src/lib/venusRoot.js.md +1 -0
- package/docs/src/venus.js.md +7 -0
- package/package.json +4 -4
- package/src/lib/tarpit/corpora.js +2 -12
- package/src/lib/tarpit/markov.js +78 -19
- package/src/lib/tarpit/tar.js +45 -30
- package/src/lib/tarpit/word2vec/graph.js +43 -0
- package/src/lib/tarpit/word2vec/train.js +93 -0
- package/src/lib/tarpit/words/randomWord.js +14 -7
- package/src/lib/venusRoot.js +10 -13
- package/src/venus.js +29 -15
- package/testing/index.js +3 -7
- package/testing/venus.js +28 -35
- package/src/lib/tarpit/pit.js +0 -62
- package/unit_tests/main.js +0 -20
- package/unit_tests/randomWord.test.js +0 -10
- package/unit_tests/tar.test.js +0 -24
- package/unit_tests/venusRoot.test.js +0 -13
package/docs/overview.md
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
# Overview
|
|
2
2
|
Venus uses honeypot pages to trap scrapers in a 'tarpit'. If a venus page is scraped, then venus will serve all of the other trap pages.
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
Please note that this only works with Express. You can try to use @fastify/express, but it's unsupported and untested.
|
|
4
|
+
|
|
5
|
+
# Structure
|
|
6
|
+
Each doc/explanation mirrors its location in the codebase. For example, /src/lib/tarpit/tar.js corresponds to /docs/src/lib/tarpit/tar.js.md.
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Window? Why?
|
|
2
|
+
The window exists so that the output has a semblance of coherence, in order to trick scrapers into thinking they are getting good, real data. In fact, unless a trigger is hit, it will be good, real(ish) data: just reworded Animal Farm.
|
|
3
|
+
|
|
4
|
+
# Triggers and semantic drift
|
|
5
|
+
The triggers and semantic drift are what give this tarpit its punch. Once generation hits a trigger word, a word2vec model slowly starts a semantic drift, generating more and more insane words until the entire generation becomes word salad.
|
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
# Route generation
|
|
2
|
+
Route generation is done by a catch-all, so that anything visited past the prefix is valid*, and then random routes are generated.
|
|
3
|
+
|
|
4
|
+
*This is soon to change so scrapers can't detect it.
|
|
5
|
+
|
|
1
6
|
# Why the meta?
|
|
2
7
|
In theory, the meta should help increase the value of the page at first glance, so when it is scraped it must be analyzed by something more expensive
|
|
3
8
|
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
|
|
2
|
+
# Why the slow sigmoid?
|
|
3
|
+
Do it yourself if you care so much; also, it has a negligible performance boost for a heavy accuracy loss.
|
|
4
|
+
|
|
5
|
+
# Why does the actual train function look weird?
|
|
6
|
+
Because it's not mine. I took and repurposed it from somewhere on the internet (I forgot where), and I don't know what half of this math even does.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
no docs
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Self explanatory
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "venus-pit",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"description": "Express.js-based tarpit",
|
|
5
5
|
"main": "dist/venus.bundle.js",
|
|
6
6
|
"type": "module",
|
|
@@ -10,16 +10,16 @@
|
|
|
10
10
|
"scripts": {
|
|
11
11
|
"start": "node testing/index.js",
|
|
12
12
|
"build": "npx rspack build",
|
|
13
|
-
"test": "node unit_tests/main.js",
|
|
14
13
|
"devbuild": "npm run build && cp dist/venus.bundle.js testing/venus.js"
|
|
15
14
|
},
|
|
16
15
|
"author": "Shrey Yadav",
|
|
17
16
|
"license": "MPL-2.0",
|
|
18
17
|
"dependencies": {
|
|
19
|
-
"express": "
|
|
18
|
+
"express": "5.1.0"
|
|
20
19
|
},
|
|
21
20
|
"devDependencies": {
|
|
22
21
|
"@rspack/cli": "^1.6.0",
|
|
23
|
-
"node-html-parser": "^7.0.1"
|
|
22
|
+
"node-html-parser": "^7.0.1",
|
|
23
|
+
"prettier": "^3.7.4"
|
|
24
24
|
}
|
|
25
25
|
}
|
|
@@ -27,7 +27,6 @@ settled down in the straw immediately in front of the platform. The
|
|
|
27
27
|
hens perched themselves on the window-sills, the pigeons fluttered up
|
|
28
28
|
to the rafters, the sheep and cows lay down behind the pigs and began
|
|
29
29
|
to chew the cud. The two cart-horses, Boxer and Clover, came in
|
|
30
|
-
CHAPTER 1
|
|
31
30
|
together, walking very slowly and setting down their vast hairy hoofs
|
|
32
31
|
with great care lest there should be some small animal concealed in the
|
|
33
32
|
straw. Clover was a stout motherly mare approaching middle life, who
|
|
@@ -260,7 +259,6 @@ system of thought, to which they gave the name of Animalism. Several
|
|
|
260
259
|
nights a week, after Mr. Jones was asleep, they held secret meetings in
|
|
261
260
|
the barn and expounded the principles of Animalism to the others. At
|
|
262
261
|
the beginning they met with much stupidity and apathy. Some of the
|
|
263
|
-
CHAPTER 2
|
|
264
262
|
animals talked of the duty of loyalty to Mr. Jones, whom they referred
|
|
265
263
|
to as “Master,” or made elementary remarks such as “Mr. Jones feeds
|
|
266
264
|
us. If he were gone, we should starve to death.” Others asked such
|
|
@@ -488,7 +486,6 @@ All through that summer the work of the farm went like
|
|
|
488
486
|
clockwork. The animals were happy as they had never conceived it
|
|
489
487
|
possible to be. Every mouthful of food was an acute positive pleasure,
|
|
490
488
|
now that it was truly their own food, produced by themselves and for
|
|
491
|
-
CHAPTER 3
|
|
492
489
|
themselves, not doled out to them by a grudging master. With the
|
|
493
490
|
worthless parasitical human beings gone, there was more for everyone
|
|
494
491
|
to eat. There was more leisure too, inexperienced though the animals
|
|
@@ -684,7 +681,6 @@ rebellion on Animal Farm, and very anxious to prevent their own
|
|
|
684
681
|
animals from learning too much about it. At first they pretended to
|
|
685
682
|
laugh to scorn the idea of animals managing a farm for themselves.
|
|
686
683
|
The whole thing would be over in a fortnight, they said. They put it
|
|
687
|
-
CHAPTER 4
|
|
688
684
|
about that the animals on the Manor Farm (they insisted on calling it
|
|
689
685
|
the Manor Farm; they would not tolerate the name “Animal Farm”)
|
|
690
686
|
were perpetually fighting among themselves and were also rapidly
|
|
@@ -836,7 +832,6 @@ she went to Mollie’s stall and turned over the straw with her hoof.
|
|
|
836
832
|
Hidden under the straw was a little pile of lump sugar and several
|
|
837
833
|
bunches of ribbon of different colours.
|
|
838
834
|
Three days later Mollie disappeared. For some weeks nothing was
|
|
839
|
-
CHAPTER 5
|
|
840
835
|
known of her whereabouts, then the pigeons reported that they had
|
|
841
836
|
seen her on the other side of Willingdon. She was between the shafts of
|
|
842
837
|
a smart dogcart painted red and black, which was standing outside a
|
|
@@ -1112,7 +1107,6 @@ they were, were lying all over the bed of the quarry. The animals lashed
|
|
|
1112
1107
|
ropes round these, and then all together, cows, horses, sheep, any
|
|
1113
1108
|
animal that could lay hold of the rope — even the pigs sometimes
|
|
1114
1109
|
joined in at critical moments — they dragged them with desperate
|
|
1115
|
-
CHAPTER 6
|
|
1116
1110
|
slowness up the slope to the top of the quarry, where they were toppled
|
|
1117
1111
|
over the edge, to shatter to pieces below. Transporting the stone when
|
|
1118
1112
|
it was once broken was comparatively simple. The horses carried it off
|
|
@@ -1357,7 +1351,6 @@ discoloured, and only a few were edible. For days at a time the animals
|
|
|
1357
1351
|
had nothing to eat but chaff and mangels. Starvation seemed to stare
|
|
1358
1352
|
them in the face.
|
|
1359
1353
|
It was vitally necessary to conceal this fact from the outside world.
|
|
1360
|
-
CHAPTER 7
|
|
1361
1354
|
Emboldened by the collapse of the windmill, the human beings were
|
|
1362
1355
|
inventing fresh lies about Animal Farm. Once again it was being put
|
|
1363
1356
|
about that all the animals were dying of famine and disease, and that
|
|
@@ -1677,7 +1670,6 @@ animals saw no reason to disbelieve him, especially as they could no
|
|
|
1677
1670
|
longer remember very clearly what conditions had been like before the
|
|
1678
1671
|
Rebellion. All the same, there were days when they felt that they would
|
|
1679
1672
|
sooner have had less figures and more food.
|
|
1680
|
-
CHAPTER 8
|
|
1681
1673
|
All orders were now issued through Squealer or one of the other
|
|
1682
1674
|
pigs. Napoleon himself was not seen in public as often as once in a
|
|
1683
1675
|
fortnight. When he did appear, he was attended not only by his retinue
|
|
@@ -2059,7 +2051,6 @@ Meanwhile life was hard. The winter was as cold as the last one
|
|
|
2059
2051
|
had been, and food was even shorter. Once again all rations were
|
|
2060
2052
|
reduced, except those of the pigs and the dogs. A too rigid equality in
|
|
2061
2053
|
rations, Squealer explained, would have been contrary to the principles
|
|
2062
|
-
CHAPTER 9
|
|
2063
2054
|
of Animalism. In any case he had no difficulty in proving to the other
|
|
2064
2055
|
animals that they were NOT in reality short of food, whatever the
|
|
2065
2056
|
appearances might be. For the time being, certainly, it had been found
|
|
@@ -2371,7 +2362,6 @@ letter B. They accepted everything that they were told about the
|
|
|
2371
2362
|
Rebellion and the principles of Animalism, especially from Clover, for
|
|
2372
2363
|
whom they had an almost filial respect; but it was doubtful whether
|
|
2373
2364
|
they understood very much of it.
|
|
2374
|
-
CHAPTER 10
|
|
2375
2365
|
The farm was more prosperous now, and better organised: it had
|
|
2376
2366
|
even been enlarged by two fields which had been bought from Mr.
|
|
2377
2367
|
Pilkington. The windmill had been successfully completed at last, and
|
|
@@ -2643,6 +2633,6 @@ question, now, what had happened to the faces of the pigs. The
|
|
|
2643
2633
|
creatures outside looked from pig to man, and from man to pig, and
|
|
2644
2634
|
from pig to man again; but already it was impossible to say which was
|
|
2645
2635
|
which.
|
|
2646
|
-
|
|
2636
|
+
`;
|
|
2647
2637
|
|
|
2648
|
-
export default corpora
|
|
2638
|
+
export default corpora;
|
package/src/lib/tarpit/markov.js
CHANGED
|
@@ -1,26 +1,85 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
import corpora from "./corpora.js";
|
|
2
|
+
import Graph from "./word2vec/graph.js";
|
|
3
|
+
import train from "./word2vec/train.js";
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
const nextWord = words[i + 1];
|
|
9
|
-
if (!transitions[word]) transitions[word] = [];
|
|
10
|
-
transitions[word].push(nextWord);
|
|
11
|
-
}
|
|
5
|
+
/**
 * Word-level Markov text generator with word2vec-driven "semantic drift".
 *
 * The corpus is split on whitespace. Every run of `window - 1` consecutive
 * words becomes a key in `transitions`, mapped to the words that followed it.
 * Once a generated word matches one of `triggers`, every subsequent word is
 * swapped for its (noisy) nearest neighbour in the embedding graph, and the
 * noise doubles on each step.
 */
class Markov {
  /**
   * @param {string} corpora - Training text, tokenised on whitespace.
   * @param {number} word2VecDimensions - Length of each embedding vector.
   * @param {string[]} [triggers=[]] - Tokens that switch generation into drift mode.
   * @param {number} window - N-gram size: `window - 1` words predict the next one.
   * @throws {RangeError} If `window` is not an integer ≥ 2.
   */
  constructor(corpora, word2VecDimensions, triggers = [], window) {
    // `undefined < 2` is false, so a missing window used to slip through.
    if (!Number.isInteger(window) || window < 2) {
      throw new RangeError("window must be ≥ 2");
    }

    this.window = window;
    // Leading/trailing whitespace would otherwise produce empty-string tokens.
    this.words = corpora.split(/\s+/).filter((word) => word !== "");
    this.cube = new Graph(word2VecDimensions);
    this.triggers = triggers;
    // Null prototype: a key such as "constructor" must not hit inherited members.
    this.transitions = Object.create(null);

    // `<=` so that the last n-gram of the corpus is recorded too.
    for (let i = 0; i + this.window <= this.words.length; i++) {
      const key = this.words.slice(i, i + this.window - 1).join("|");
      const next = this.words[i + this.window - 1];
      if (!this.transitions[key]) this.transitions[key] = [];
      this.transitions[key].push(next);
    }

    // Train on the cleaned token stream so both vocabularies agree.
    train(this.cube, [this.words.join(" ")], 10);
  }

  /**
   * Uniform random integer in the range 0 .. max - 1.
   * @param {number} max - Exclusive upper bound.
   * @returns {number}
   */
  _random(max) {
    return Math.floor(Math.random() * max);
  }

  /**
   * Generates text containing exactly `tokens` space-separated words.
   * @param {number} tokens - Number of words wanted; values below 1 yield "".
   * @returns {string}
   */
  generate(tokens) {
    const wanted = Math.max(0, Math.floor(Number(tokens) || 0));
    if (wanted === 0 || this.words.length === 0) return "";

    let drifting = false;
    let noise = Math.random();

    // Seed the history with a random run of real corpus words.
    const startChoices = Math.max(this.words.length - this.window + 1, 1);
    const start = this._random(startChoices);
    const history = this.words.slice(start, start + this.window - 1);
    const output = [...history];

    for (let i = history.length; i < wanted; i++) {
      // Back off to progressively shorter suffixes of the history.
      let choices = null;
      for (let skip = 0; skip < history.length && !choices; skip++) {
        const candidates = this.transitions[history.slice(skip).join("|")];
        if (candidates && candidates.length) choices = candidates;
      }

      // No known continuation: fall back to any corpus word.
      let next = choices
        ? choices[this._random(choices.length)]
        : this.words[this._random(this.words.length)];

      if (!drifting && this.triggers.includes(next)) drifting = true;
      if (drifting) {
        const vector = this.cube.get(next);
        const neighbour = vector ? this.cube.nearest(vector, noise) : undefined;
        if (neighbour) next = neighbour.word;
        noise = noise * 2;
      }

      output.push(next);
      history.shift();
      history.push(next);
    }

    // The seed alone may already be longer than the requested length.
    return output.slice(0, wanted).join(" ");
  }
}
|
|
71
|
+
// Start semantic drift from a few 'triggers', so the data hopefully gets processed, looks useful,
// and is ultimately discarded as junk.
// see https://arxiv.org/abs/2510.07192
// NOTE(review): a trigger is compared for exact equality against whitespace-split corpus tokens,
// so casing and attached punctuation must match for it to fire. Confirm each entry against the corpus.
const triggers = [
  "fertilises",
  "mantelpiece",
  "windmill",
  "commandment",
  "comrades",
  "comrade",
  "Napoleon",
];

const markov = new Markov(corpora, 5, triggers, 4);
|
|
85
|
+
export default markov;
|
package/src/lib/tarpit/tar.js
CHANGED
|
@@ -1,34 +1,49 @@
|
|
|
1
|
-
// see /docs/tar.md
|
|
1
|
+
// see /docs/src/lib/tarpit/tar.js.md
|
|
2
2
|
|
|
3
3
|
import markov from "./markov.js"
|
|
4
|
-
import
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
4
|
+
import { randomWord } from "./words/randomWord.js";
|
|
5
|
+
/** Escapes the characters that are significant in HTML text and attribute values. */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Builds one tarpit page: Markov-generated text, a link to another random
 * route under the instance root, and an inline script that burns client CPU
 * and memory before filling in a second paragraph.
 */
class Tar {
  /**
   * @param {string} instanceRoot - URL prefix the tarpit is mounted under.
   */
  constructor(instanceRoot) {
    this.instanceRoot = instanceRoot;
    this.markov = markov;
  }

  /**
   * Produces a random path of 5 to 9 corpus words joined by "/".
   * @returns {string}
   */
  _makeRoute() {
    const length = Math.max(Math.floor(Math.random() * 10), 5);
    return Array.from({ length }, () => randomWord()).join("/");
  }

  /**
   * Generates the HTML for a single trap page.
   * @returns {string}
   */
  generate() {
    const title = escapeHtml(this.markov.generate(2));
    const header = title;
    const link = escapeHtml(
      this.markov.generate(Math.floor(Math.random() * 10) + 1),
    );
    const content = escapeHtml(this.markov.generate(50));
    const meta = escapeHtml(
      this.markov.generate(Math.floor(Math.random() * 10)),
    );

    // JSON.stringify yields a valid JS string literal; escaping "<" keeps the
    // text from closing the surrounding <script> element early.
    const scriptedcontent = JSON.stringify(
      this.markov.generate(50 + Math.floor(Math.random() * 10)),
    ).replace(/</g, "\\u003c");

    // The route is appended directly, so the root must end with a slash.
    const root = String(this.instanceRoot);
    const base = escapeHtml(root.endsWith("/") ? root : `${root}/`);
    const next = this._makeRoute();

    return `
<head>
<title>${title}</title>
<meta name="description" content="${meta}">
</head>
<body>
<h1>${header}</h1><br/>
<p>${content}</p>
<p id="real"></p>
<a href='${base}${next}/'>${link}</a>
<script>
let result = 0;
for (let i = 0; i < 1000000; i++) {
result += Math.sqrt(Math.pow(Math.sin(i) * Math.cos(Math.sqrt(i)), Math.sqrt(2)));
result += Math.log(Math.abs(i) + 1) * Math.exp(Math.random());
}
console.log(result)
let y = new Array(400*2024*10).fill(0)
document.getElementById("real").innerText = ${scriptedcontent}
</script>
</body>
`;
  }
}
|
|
33
48
|
|
|
34
|
-
export {
|
|
49
|
+
export { Tar };
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/** Euclidean distance between two numeric vectors, measured over the length of `a`. */
function euclidean(a, b) {
  let sum = 0;
  for (let i = 0; i < a.length; i++) sum += (a[i] - b[i]) ** 2;
  return Math.sqrt(sum);
}

/**
 * Fixed-dimension embedding store: maps a word to its coordinate vector and
 * answers distance / nearest-neighbour queries.
 */
class Graph {
  /**
   * @param {number} [dimensions=512] - Required length of every stored vector.
   */
  constructor(dimensions = 512) {
    this.map = new Map();
    this.dimensions = dimensions;
  }

  /**
   * Stores (or overwrites) the vector for a word.
   * @param {string} value - The word.
   * @param {ArrayLike<number>} cords - Coordinates; length must equal `dimensions`.
   * @throws {RangeError} If the vector has the wrong length.
   */
  set(value, cords) {
    if (cords.length !== this.dimensions) {
      throw new RangeError(
        `Expected ${this.dimensions} got ${cords.length}`,
      );
    }
    this.map.set(value, cords);
  }

  /**
   * @param {string} value - The word.
   * @returns {ArrayLike<number>|undefined} The stored vector, if any.
   */
  get(value) {
    return this.map.get(value);
  }

  /**
   * Euclidean distance between two stored words.
   * @param {string} word1
   * @param {string} word2
   * @returns {number}
   * @throws {TypeError} If either word has no stored vector.
   */
  distance(word1, word2) {
    const first = this.map.get(word1);
    const second = this.map.get(word2);
    if (!first || !second) {
      throw new TypeError("distance() requires two words present in the graph");
    }
    return euclidean(first, second);
  }

  /**
   * Finds the stored word closest to a coordinate vector.
   * @param {ArrayLike<number>} word - A coordinate vector (not a word string).
   * @param {number} [noise=0] - Scales a random jitter of up to 0.01 * noise
   *   added to every distance, so larger values pick less similar words.
   * @returns {{word: string, distance: number}|undefined} The closest entry,
   *   or undefined when the graph is empty.
   * @throws {TypeError} If `word` is null or undefined.
   */
  nearest(word, noise = 0) {
    if (word == null) {
      throw new TypeError("nearest() requires a coordinate vector");
    }

    // Single pass for the minimum; ties keep the earliest inserted entry.
    let best;
    for (const [other, vec] of this.map) {
      const distance = euclidean(word, vec) + Math.random() * 0.01 * noise;
      if (best === undefined || distance < best.distance) {
        best = { word: other, distance };
      }
    }
    return best;
  }
}
|
|
42
|
+
|
|
43
|
+
export default Graph;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
 * Creates a small random starting embedding.
 * @param {number} dim - Number of components.
 * @returns {Float32Array} Vector whose components are (Math.random() - 0.5) * 0.01.
 */
function randomVector(dim) {
  return new Float32Array(dim).map(() => (Math.random() - 0.5) * 0.01);
}
|
|
6
|
+
|
|
7
|
+
/**
 * Logistic function, clamped to exactly 0 or 1 outside [-6, 6],
 * where the curve is too flat for the difference to matter.
 * @param {number} x
 * @returns {number}
 */
function sigmoid(x) {
  const saturated = Math.abs(x) > 6;
  if (saturated) {
    return x > 0 ? 1.0 : 0.0;
  }
  return 1 / (Math.exp(-x) + 1);
}
|
|
13
|
+
|
|
14
|
+
// all of the code here is ripped from somewhere on the internet
|
|
15
|
+
// i can NOT do math
|
|
16
|
+
/**
 * One gradient step on a pair of vectors, updating both in place.
 * `label` is 1 to pull the pair together and 0 to push it apart.
 */
function nudgePair(centre, other, label, lr, dimensions) {
  let dot = 0;
  for (let d = 0; d < dimensions; d++) dot += centre[d] * other[d];
  const grad = label - sigmoid(dot);

  for (let d = 0; d < dimensions; d++) {
    // Keep the pre-update value so both updates use the same centre component.
    const t = centre[d];
    centre[d] += lr * grad * other[d];
    other[d] += lr * grad * t;
  }
}

/**
 * Trains word vectors in `graph` with skip-gram style updates and negative
 * sampling. Words not yet in the graph get a random starting vector; all
 * vectors are updated in place.
 *
 * @param {object} graph - Embedding store exposing `get`, `set`, `map` and `dimensions`.
 * @param {string[]} sentences - Texts to train on; each is tokenised on whitespace.
 * @param {number} [epochs=5] - Number of passes over all sentences.
 * @param {number} [lr=0.025] - Learning rate.
 * @param {number} [window=5] - Context words considered on each side of a word.
 * @param {number} [negSample=5] - Negative samples drawn per (word, context) pair.
 */
function train(
  graph,
  sentences,
  epochs = 5,
  lr = 0.025,
  window = 5,
  negSample = 5,
) {
  // Drop the empty tokens that leading/trailing whitespace produces, so that
  // "" never becomes a vocabulary word.
  const tokenizedSentences = sentences.map((sentence) =>
    sentence.split(/\s+/).filter((token) => token !== ""),
  );

  for (const tokens of tokenizedSentences) {
    for (const token of tokens) {
      if (!graph.get(token)) {
        graph.set(token, randomVector(graph.dimensions));
      }
    }
  }

  const words = Array.from(graph.map.keys());
  const dimensions = graph.dimensions;

  for (let epoch = 0; epoch < epochs; epoch++) {
    for (const sentence of tokenizedSentences) {
      for (let i = 0; i < sentence.length; i++) {
        const word = sentence[i];
        const wordVector = graph.get(word);

        const left = Math.max(0, i - window);
        const right = Math.min(sentence.length - 1, i + window);

        for (let j = left; j <= right; j++) {
          if (j === i) continue;

          const contextWord = sentence[j];
          nudgePair(wordVector, graph.get(contextWord), 1, lr, dimensions);

          for (let k = 0; k < negSample; k++) {
            const negWord = words[(Math.random() * words.length) | 0];
            // Never push away the word itself or the context word that was
            // just pulled closer.
            if (negWord === word || negWord === contextWord) continue;

            nudgePair(wordVector, graph.get(negWord), 0, lr, dimensions);
          }
        }
      }
    }
    console.log(`finished epoch ${epoch + 1}/${epochs}`);
  }
}
|
|
92
|
+
|
|
93
|
+
export default train;
|
|
@@ -1,12 +1,19 @@
|
|
|
1
|
-
import corpora from "../corpora.js"
|
|
2
|
-
import crypto from
|
|
1
|
+
import corpora from "../corpora.js";
|
|
2
|
+
import crypto from "crypto";
|
|
3
3
|
|
|
4
4
|
/**
 * Builds the list of distinct lowercase words in a text.
 * Everything except letters and whitespace is removed first. Whitespace,
 * including newlines, is kept so that words on adjacent lines stay separate.
 * @param {string} text
 * @returns {string[]} Unique words in first-seen order.
 */
function buildWordList(text) {
  const tokens = text
    .replace(/[^a-zA-Z\s]/g, "")
    .toLowerCase()
    .split(/\s+/)
    .filter((token) => token !== "");
  return Array.from(new Set(tokens));
}

// Built on first use from the corpus, then reused.
let wordList = null;

/**
 * Picks a uniformly random word from the corpus vocabulary.
 * @returns {string}
 * @throws {RangeError} If the corpus contains no words.
 */
function randomWord() {
  if (wordList === null) wordList = buildWordList(corpora);
  if (wordList.length === 0) {
    throw new RangeError("corpus contains no words to choose from");
  }
  const index = crypto.randomInt(0, wordList.length);
  return wordList[index];
}
|
|
10
18
|
|
|
11
|
-
|
|
12
|
-
export {randomWord}
|
|
19
|
+
export { randomWord };
|
package/src/lib/venusRoot.js
CHANGED
|
@@ -1,18 +1,15 @@
|
|
|
1
1
|
// venus root path: the URL prefix under which the tarpit is served
class venusRoot {
  /**
   * @param {string} [venusRoot="/"] - Prefix to serve under, e.g. "/" or "/some/path/".
   *   A missing leading or trailing slash is added.
   * @throws {URIError} If the prefix is not a non-empty string.
   */
  constructor(venusRoot = "/") {
    if (typeof venusRoot !== "string" || venusRoot === "") {
      throw new URIError(
        [
          "The Venus class must be done as following : ",
          "",
          'new Venus("/"), --> prefix becomes \'/\'',
          'new Venus("/some/path/"), --> prefix becomes \'/some/path/\'',
          "new Venus() --> prefix becomes '/'",
          "",
        ].join("\n"),
      );
    }

    // Generated links are appended directly to the prefix, so it must both
    // start and end with a slash.
    let path = venusRoot.startsWith("/") ? venusRoot : `/${venusRoot}`;
    if (!path.endsWith("/")) path = `${path}/`;
    this.path = path;
  }
}
|
|
17
14
|
|
|
18
|
-
export { venusRoot }
|
|
15
|
+
export { venusRoot };
|
package/src/venus.js
CHANGED
|
@@ -1,17 +1,31 @@
|
|
|
1
|
-
import { venusRoot } from
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
1
|
+
import { venusRoot } from "./lib/venusRoot.js";
|
|
2
|
+
import { Tar } from "./lib/tarpit/tar.js";
|
|
3
|
+
import express from "express";
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
}
|
|
5
|
+
/**
 * Express tarpit: every request under the configured prefix is answered,
 * after a random delay, with a freshly generated trap page.
 */
class Venus {
  /**
   * @param {string} [root="/"] - URL prefix the tarpit is mounted under.
   *   Defaults to "/", matching the documented `new Venus()` behaviour.
   */
  constructor(root = "/") {
    this.root = new venusRoot(root);
    this.prefix = this.root.path;
    this.tar = new Tar(this.root.path);
  }

  /**
   * Response delay in milliseconds: sqrt(Math.random() * 10) * 500,
   * i.e. from 0 up to about 1581.
   * @returns {number}
   */
  _rand() {
    return Math.sqrt(Math.random() * 10) * 500;
  }

  /**
   * Builds the catch-all router; mount it with `app.use(venus.prefix, venus.route())`.
   * @returns {object} An Express router.
   */
  route() {
    const router = express.Router({ mergeParams: true });
    console.log("path: " + `${this.root.path}`);

    router.use((req, res) => {
      const timer = setTimeout(() => {
        res.send(this.tar.generate());
      }, this._rand());

      // Do not generate a page for a client that has already disconnected.
      res.on("close", () => clearTimeout(timer));

      console.log(`Creating tarpit for ${req.headers["user-agent"]}`);
    });

    return router;
  }
}
|
|
30
|
+
|
|
31
|
+
export default Venus;
|
package/testing/index.js
CHANGED
|
@@ -1,13 +1,9 @@
|
|
|
1
|
-
import
|
|
1
|
+
import Venus from "./venus.js"
|
|
2
2
|
import express from 'express'
|
|
3
3
|
|
|
4
4
|
const app = express()
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
app.get("/", (req, res) =>{
|
|
9
|
-
res.send(`<html>Pretend this has some real page content <a href="${v}">some text</a> </html>`)
|
|
10
|
-
})
|
|
5
|
+
const venus = new Venus("/")
|
|
6
|
+
app.use(venus.prefix, venus.route())
|
|
11
7
|
|
|
12
8
|
app.listen(8080, () => {
|
|
13
9
|
console.log("Listening on port 8080")
|