text_rank 1.2.3 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.codeclimate.yml +1 -1
- data/.gitignore +4 -0
- data/.rubocop.yml +7 -0
- data/.ruby-version +1 -1
- data/.travis.yml +1 -0
- data/Rakefile +5 -0
- data/bin/console +3 -3
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +300 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +5 -0
- data/lib/page_rank/base.rb +12 -9
- data/lib/page_rank/dense.rb +3 -2
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/page_rank.rb +7 -4
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/version.rb +3 -1
- data/lib/text_rank.rb +14 -9
- data/text_rank.gemspec +4 -1
- metadata +48 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b25779e7c013e9d0c1d867324f58d40a062bacae0e38f92714e1d3fd7b0e7ef
|
4
|
+
data.tar.gz: 34c36b8ff6673092b2463b9f4e0fdaf94a55e50c3e52e4aeec125775c7fa3a9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f03e71745ed96077c63ed376303fcfaa8683f960319d71a405b943aa4a23383938c914b33c867f76f4d979505aeb1d5a0110b51dcc1eadab14cab41d6ee8697
|
7
|
+
data.tar.gz: edb17a0ee101254a5afc7c7ee5b084e11ce1d8bfd5083d5069a5e0841751a1f8f175fa167b788186b881e72a1114fa17a8cddc8c3328e00dda812a89b8647196
|
data/.codeclimate.yml
CHANGED
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -15,6 +15,10 @@ Layout/EmptyLinesAroundModuleBody:
|
|
15
15
|
Layout/ExtraSpacing:
|
16
16
|
Enabled: false
|
17
17
|
|
18
|
+
Layout/HashAlignment:
|
19
|
+
EnforcedHashRocketStyle: table
|
20
|
+
EnforcedColonStyle: table
|
21
|
+
|
18
22
|
Layout/LineLength:
|
19
23
|
Max: 120
|
20
24
|
Enabled: false
|
@@ -89,6 +93,9 @@ Style/GuardClause:
|
|
89
93
|
Style/HashEachMethods:
|
90
94
|
Enabled: true
|
91
95
|
|
96
|
+
Style/HashSyntax:
|
97
|
+
Enabled: true
|
98
|
+
|
92
99
|
Style/HashTransformKeys:
|
93
100
|
Enabled: true
|
94
101
|
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
ruby-
|
1
|
+
ruby-3.0.3
|
data/.travis.yml
CHANGED
@@ -9,6 +9,7 @@ before_script:
|
|
9
9
|
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
10
10
|
- chmod +x ./cc-test-reporter
|
11
11
|
- ./cc-test-reporter before-build
|
12
|
+
- bundle exec rake compile
|
12
13
|
script:
|
13
14
|
- bundle exec rspec
|
14
15
|
after_script:
|
data/Rakefile
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require "rake/extensiontask"
|
2
3
|
require "rspec/core/rake_task"
|
3
4
|
|
4
5
|
RSpec::Core::RakeTask.new(:spec)
|
@@ -10,3 +11,7 @@ RDoc::Task.new do |rdoc|
|
|
10
11
|
rdoc.main = "README.md"
|
11
12
|
rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
|
12
13
|
end
|
14
|
+
|
15
|
+
Rake::ExtensionTask.new('text_rank') do |ext|
|
16
|
+
ext.lib_dir = 'lib/text_rank'
|
17
|
+
end
|
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'text_rank'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "text_rank"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start
|
@@ -0,0 +1,300 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include <page_rank_sparse_native.h>
|
4
|
+
|
5
|
+
const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
|
6
|
+
const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
|
7
|
+
const size_t NODE_SIZE = sizeof(NodeStruct);
|
8
|
+
const size_t EDGE_SIZE = sizeof(EdgeStruct);
|
9
|
+
const size_t GRAPH_SIZE = sizeof(GraphStruct);
|
10
|
+
|
11
|
+
static const rb_data_type_t graph_typed_data = {
|
12
|
+
"PageRank/SparseNative/Graph",
|
13
|
+
{ 0, free_graph, },
|
14
|
+
0, 0,
|
15
|
+
RUBY_TYPED_FREE_IMMEDIATELY,
|
16
|
+
};
|
17
|
+
|
18
|
+
|
19
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
20
|
+
|
21
|
+
void Init_sparse_native() {
|
22
|
+
VALUE PageRankModule, SparseNativeClass;
|
23
|
+
|
24
|
+
PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
|
25
|
+
SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
|
26
|
+
|
27
|
+
rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
|
28
|
+
rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
|
29
|
+
rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
|
30
|
+
}
|
31
|
+
|
32
|
+
VALUE sparse_native_allocate(VALUE self) {
|
33
|
+
Graph g = malloc(GRAPH_SIZE);
|
34
|
+
|
35
|
+
// Grab a reference to the hash type used by a generic Ruby {}
|
36
|
+
// which accepts any key and any value. We'll need this type to create
|
37
|
+
// a st_table in which to put arbitrary VALUE keys. This hash type
|
38
|
+
// should be a static constant and thus should be safe to utilize without
|
39
|
+
// fear of garbage collection.
|
40
|
+
const struct st_hash_type *objhash = rb_hash_tbl(rb_hash_new(), "page_rank_sparse_native.c", 40)->type;
|
41
|
+
|
42
|
+
g->node_count = 0;
|
43
|
+
g->nodes = NULL;
|
44
|
+
g->dangling_nodes = NULL;
|
45
|
+
g->node_lookup = st_init_table_with_size(objhash, 0);
|
46
|
+
|
47
|
+
return TypedData_Wrap_Struct(self, &graph_typed_data, g);
|
48
|
+
}
|
49
|
+
|
50
|
+
VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
|
51
|
+
Graph g;
|
52
|
+
|
53
|
+
TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
|
54
|
+
add_edge_with_labels(g, source, dest, NUM2DBL(weight));
|
55
|
+
return Qnil;
|
56
|
+
}
|
57
|
+
|
58
|
+
VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
|
59
|
+
Graph g;
|
60
|
+
VALUE ranks;
|
61
|
+
|
62
|
+
TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
|
63
|
+
calculate(g, FIX2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));
|
64
|
+
|
65
|
+
ranks = rb_hash_new();
|
66
|
+
sort_and_normalize_ranks(g, rb_hash_dset, ranks);
|
67
|
+
return ranks;
|
68
|
+
}
|
69
|
+
|
70
|
+
void rb_hash_dset(VALUE hash, VALUE key, double value) {
|
71
|
+
rb_hash_aset(hash, key, DBL2NUM(value));
|
72
|
+
}
|
73
|
+
|
74
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
75
|
+
|
76
|
+
void free_graph(void *data) {
|
77
|
+
Graph g = (Graph)data;
|
78
|
+
free_node_list(g->nodes, free_node);
|
79
|
+
free_node_list(g->dangling_nodes, NULL);
|
80
|
+
free(g->node_lookup);
|
81
|
+
free(g);
|
82
|
+
}
|
83
|
+
|
84
|
+
void free_node(Node n) {
|
85
|
+
free_edge_list(n->source_edges, free_edge);
|
86
|
+
free(n);
|
87
|
+
}
|
88
|
+
|
89
|
+
void free_node_list(NodeList nodes, void (*free_item)(Node)) {
|
90
|
+
NodeList tmp;
|
91
|
+
while (nodes != NULL) {
|
92
|
+
tmp = nodes;
|
93
|
+
nodes = nodes->next;
|
94
|
+
if (free_item) {
|
95
|
+
free_item(tmp->node);
|
96
|
+
}
|
97
|
+
free(tmp);
|
98
|
+
}
|
99
|
+
}
|
100
|
+
|
101
|
+
void free_edge(Edge e) {
|
102
|
+
// Assume source node was allocated elsewhere and will be free'd elsewhere
|
103
|
+
free(e);
|
104
|
+
}
|
105
|
+
|
106
|
+
void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
|
107
|
+
EdgeList tmp;
|
108
|
+
while (edges != NULL) {
|
109
|
+
tmp = edges;
|
110
|
+
edges = edges->next;
|
111
|
+
if (free_item) {
|
112
|
+
free_item(tmp->edge);
|
113
|
+
}
|
114
|
+
free(tmp);
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
119
|
+
|
120
|
+
Node add_node(Graph g, VALUE label) {
|
121
|
+
NodeList tmp = malloc(NODE_LIST_SIZE);
|
122
|
+
|
123
|
+
tmp->node = malloc(NODE_SIZE);
|
124
|
+
tmp->node->label = label;
|
125
|
+
tmp->node->source_edges = NULL;
|
126
|
+
tmp->node->rank = 0.0;
|
127
|
+
tmp->node->prev_rank = 0.0;
|
128
|
+
tmp->node->outbound_weight_total = 0.0;
|
129
|
+
|
130
|
+
tmp->next = g->nodes;
|
131
|
+
g->nodes = tmp;
|
132
|
+
g->node_count += 1;
|
133
|
+
|
134
|
+
return tmp->node;
|
135
|
+
}
|
136
|
+
|
137
|
+
Node add_dangling_node(Graph g, Node n) {
|
138
|
+
NodeList tmp = malloc(NODE_LIST_SIZE);
|
139
|
+
|
140
|
+
tmp->node = n;
|
141
|
+
tmp->next = g->dangling_nodes;
|
142
|
+
g->dangling_nodes = tmp;
|
143
|
+
|
144
|
+
return n;
|
145
|
+
}
|
146
|
+
|
147
|
+
Edge add_edge(Node source, Node destination, double weight) {
|
148
|
+
EdgeList tmp = malloc(EDGE_LIST_SIZE);
|
149
|
+
|
150
|
+
tmp->edge = malloc(EDGE_SIZE);
|
151
|
+
tmp->edge->source = source;
|
152
|
+
tmp->edge->weight = weight;
|
153
|
+
|
154
|
+
tmp->next = destination->source_edges;
|
155
|
+
destination->source_edges = tmp;
|
156
|
+
source->outbound_weight_total += weight;
|
157
|
+
|
158
|
+
return tmp->edge;
|
159
|
+
}
|
160
|
+
|
161
|
+
Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
|
162
|
+
Node source, dest;
|
163
|
+
|
164
|
+
source = lookup_node(g, source_label);
|
165
|
+
dest = lookup_node(g, dest_label);
|
166
|
+
|
167
|
+
return add_edge(source, dest, weight);
|
168
|
+
}
|
169
|
+
|
170
|
+
Node lookup_node(Graph g, VALUE label) {
|
171
|
+
Node n;
|
172
|
+
|
173
|
+
if (!st_lookup(g->node_lookup, (st_data_t)label, (st_data_t *)&n)) {
|
174
|
+
n = add_node(g, label);
|
175
|
+
st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
|
176
|
+
}
|
177
|
+
return n;
|
178
|
+
}
|
179
|
+
|
180
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
181
|
+
|
182
|
+
void calculate_start(Graph g) {
|
183
|
+
NodeList nodes;
|
184
|
+
Node source, destination;
|
185
|
+
EdgeList edges;
|
186
|
+
Edge e;
|
187
|
+
|
188
|
+
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
|
189
|
+
destination = nodes->node;
|
190
|
+
|
191
|
+
// If there is no outbound weight, this is a "dangling" node
|
192
|
+
if (destination->outbound_weight_total == 0.0) {
|
193
|
+
add_dangling_node(g, destination);
|
194
|
+
}
|
195
|
+
|
196
|
+
// Normalize all source edge weights
|
197
|
+
for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
|
198
|
+
e = edges->edge;
|
199
|
+
source = e->source;
|
200
|
+
e->weight = e->weight / source->outbound_weight_total;
|
201
|
+
}
|
202
|
+
|
203
|
+
// Set the initial rank
|
204
|
+
destination->prev_rank = 0;
|
205
|
+
destination->rank = 1.0 / g->node_count;
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
void calculate_step(Graph g, double damping) {
|
210
|
+
NodeList nodes, dangling_nodes;
|
211
|
+
Node source, destination;
|
212
|
+
EdgeList edges;
|
213
|
+
Edge e;
|
214
|
+
double sum;
|
215
|
+
|
216
|
+
// Set prev rank to rank for all nodes
|
217
|
+
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
|
218
|
+
destination = nodes->node;
|
219
|
+
destination->prev_rank = destination->rank;
|
220
|
+
}
|
221
|
+
|
222
|
+
// Redistribute the rankings according to weight
|
223
|
+
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
|
224
|
+
destination = nodes->node;
|
225
|
+
sum = 0.0;
|
226
|
+
for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
|
227
|
+
e = edges->edge;
|
228
|
+
source = e->source;
|
229
|
+
sum += source->prev_rank * e->weight;
|
230
|
+
}
|
231
|
+
for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
|
232
|
+
source = dangling_nodes->node;
|
233
|
+
sum += source->prev_rank / g->node_count;
|
234
|
+
}
|
235
|
+
destination->rank = damping * sum + (1 - damping) / g->node_count;
|
236
|
+
}
|
237
|
+
}
|
238
|
+
|
239
|
+
// Calculate the Euclidean distance from prev_rank to rank across all nodes
|
240
|
+
double prev_distance(Graph g) {
|
241
|
+
NodeList nodes;
|
242
|
+
Node n;
|
243
|
+
double rank_diff, sum_squares = 0.0;
|
244
|
+
|
245
|
+
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
|
246
|
+
n = nodes->node;
|
247
|
+
rank_diff = n->prev_rank - n->rank;
|
248
|
+
sum_squares += rank_diff * rank_diff;
|
249
|
+
}
|
250
|
+
|
251
|
+
return sqrt(sum_squares);
|
252
|
+
}
|
253
|
+
|
254
|
+
void calculate(Graph g, int max_iterations, double damping, double tolerance) {
|
255
|
+
calculate_start(g);
|
256
|
+
|
257
|
+
while (max_iterations != 0) { // If negative one, allow to go without limit
|
258
|
+
calculate_step(g, damping);
|
259
|
+
if (prev_distance(g) < tolerance) {
|
260
|
+
break;
|
261
|
+
}
|
262
|
+
max_iterations--;
|
263
|
+
}
|
264
|
+
}
|
265
|
+
|
266
|
+
int node_compare(const void *v1, const void *v2) {
|
267
|
+
double rank1, rank2, cmp;
|
268
|
+
|
269
|
+
rank1 = (*(Node *)v1)->rank;
|
270
|
+
rank2 = (*(Node *)v2)->rank;
|
271
|
+
cmp = rank2 - rank1; // Decreasing order
|
272
|
+
if (cmp < 0) return -1;
|
273
|
+
if (cmp > 0) return 1;
|
274
|
+
return 0;
|
275
|
+
}
|
276
|
+
|
277
|
+
void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
|
278
|
+
NodeList nodes;
|
279
|
+
Node n;
|
280
|
+
double sum = 0.0;
|
281
|
+
unsigned long i;
|
282
|
+
Node *tmp;
|
283
|
+
|
284
|
+
i = g->node_count;
|
285
|
+
tmp = malloc(g->node_count * sizeof(Node));
|
286
|
+
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
|
287
|
+
n = nodes->node;
|
288
|
+
tmp[--i] = n;
|
289
|
+
sum += n->rank;
|
290
|
+
}
|
291
|
+
|
292
|
+
qsort(tmp, g->node_count, sizeof(Node), node_compare);
|
293
|
+
|
294
|
+
for (i = 0; i < g->node_count; i++) {
|
295
|
+
n = tmp[i];
|
296
|
+
callback(callback_arg, n->label, n->rank / sum);
|
297
|
+
}
|
298
|
+
|
299
|
+
free(tmp);
|
300
|
+
}
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#ifndef PAGE_RANK_SPARSE_NATIVE_H
|
2
|
+
#define PAGE_RANK_SPARSE_NATIVE_H
|
3
|
+
|
4
|
+
#include <ruby.h>
|
5
|
+
|
6
|
+
struct NodeListStruct;
|
7
|
+
typedef struct NodeListStruct* NodeList;
|
8
|
+
|
9
|
+
typedef struct NodeListStruct {
|
10
|
+
struct NodeStruct *node;
|
11
|
+
struct NodeListStruct *next;
|
12
|
+
} NodeListStruct;
|
13
|
+
|
14
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
15
|
+
|
16
|
+
struct EdgeListStruct;
|
17
|
+
typedef struct EdgeListStruct* EdgeList;
|
18
|
+
|
19
|
+
typedef struct EdgeListStruct {
|
20
|
+
struct EdgeStruct *edge;
|
21
|
+
struct EdgeListStruct *next;
|
22
|
+
} EdgeListStruct;
|
23
|
+
|
24
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
25
|
+
|
26
|
+
struct NodeStruct;
|
27
|
+
typedef struct NodeStruct* Node;
|
28
|
+
|
29
|
+
typedef struct NodeStruct {
|
30
|
+
EdgeList source_edges;
|
31
|
+
VALUE label;
|
32
|
+
double prev_rank;
|
33
|
+
double rank;
|
34
|
+
double outbound_weight_total;
|
35
|
+
} NodeStruct;
|
36
|
+
|
37
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
38
|
+
|
39
|
+
struct EdgeStruct;
|
40
|
+
typedef struct EdgeStruct* Edge;
|
41
|
+
|
42
|
+
typedef struct EdgeStruct {
|
43
|
+
Node source;
|
44
|
+
double weight;
|
45
|
+
} EdgeStruct;
|
46
|
+
|
47
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
48
|
+
|
49
|
+
struct GraphStruct;
|
50
|
+
typedef struct GraphStruct* Graph;
|
51
|
+
|
52
|
+
typedef struct GraphStruct {
|
53
|
+
unsigned long node_count;
|
54
|
+
NodeList nodes;
|
55
|
+
NodeList dangling_nodes;
|
56
|
+
st_table *node_lookup;
|
57
|
+
} GraphStruct;
|
58
|
+
|
59
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
60
|
+
|
61
|
+
void free_graph(void *data);
|
62
|
+
void free_node(Node n);
|
63
|
+
void free_node_list(NodeList nodes, void (*free_item)(Node));
|
64
|
+
void free_edge(Edge e);
|
65
|
+
void free_edge_list(EdgeList edges, void (*free_item)(Edge));
|
66
|
+
|
67
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
68
|
+
|
69
|
+
Node add_node(Graph g, VALUE label);
|
70
|
+
Node add_dangling_node(Graph g, Node n);
|
71
|
+
Edge add_edge(Node source, Node destination, double weight);
|
72
|
+
Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
|
73
|
+
Node lookup_node(Graph g, VALUE label);
|
74
|
+
|
75
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
76
|
+
|
77
|
+
void calculate_start(Graph g);
|
78
|
+
void calculate_step(Graph g, double damping);
|
79
|
+
double prev_distance(Graph g);
|
80
|
+
void calculate(Graph g, int max_iterations, double damping, double tolerance);
|
81
|
+
int node_compare(const void *v1, const void *v2);
|
82
|
+
void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);
|
83
|
+
|
84
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
85
|
+
|
86
|
+
void Init_sparse_native();
|
87
|
+
VALUE sparse_native_allocate(VALUE self);
|
88
|
+
VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
|
89
|
+
VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
|
90
|
+
VALUE sorted_and_normalized_ranks(Graph g);
|
91
|
+
void rb_hash_dset(VALUE hash, VALUE key, double value);
|
92
|
+
|
93
|
+
#endif
|
data/lib/page_rank/base.rb
CHANGED
@@ -7,6 +7,8 @@ module PageRank
|
|
7
7
|
##
|
8
8
|
class Base
|
9
9
|
|
10
|
+
attr_reader :damping, :tolerance
|
11
|
+
|
10
12
|
# @param (see #damping=)
|
11
13
|
# @param (see #tolerance=)
|
12
14
|
def initialize(damping: nil, tolerance: nil, **_)
|
@@ -19,8 +21,7 @@ module PageRank
|
|
19
21
|
# @return [Float]
|
20
22
|
def damping=(damping)
|
21
23
|
@damping = damping || 0.85
|
22
|
-
raise ArgumentError
|
23
|
-
@damping
|
24
|
+
raise ArgumentError, 'Invalid damping factor' if @damping <= 0 || @damping > 1
|
24
25
|
end
|
25
26
|
|
26
27
|
# Set the tolerance value
|
@@ -28,8 +29,7 @@ module PageRank
|
|
28
29
|
# @return [Float]
|
29
30
|
def tolerance=(tolerance)
|
30
31
|
@tolerance = tolerance || 0.0001
|
31
|
-
raise ArgumentError
|
32
|
-
@tolerance
|
32
|
+
raise ArgumentError, 'Invalid tolerance factor' if @tolerance.negative? || @tolerance > 1
|
33
33
|
end
|
34
34
|
|
35
35
|
# Adds a directed (and optionally weighted) edge to the graph
|
@@ -46,9 +46,12 @@ module PageRank
|
|
46
46
|
def calculate(max_iterations: -1, **_)
|
47
47
|
ranks = initial_ranks
|
48
48
|
loop do
|
49
|
-
break if max_iterations
|
50
|
-
|
51
|
-
|
49
|
+
break if max_iterations.zero?
|
50
|
+
|
51
|
+
prev_ranks = ranks
|
52
|
+
ranks = calculate_step(ranks)
|
53
|
+
break if distance(ranks, prev_ranks) < tolerance
|
54
|
+
|
52
55
|
max_iterations -= 1
|
53
56
|
end
|
54
57
|
sort_ranks(ranks)
|
@@ -77,9 +80,9 @@ module PageRank
|
|
77
80
|
end
|
78
81
|
|
79
82
|
# Calculate the Euclidean distance from one ranking to the next iteration
|
80
|
-
def distance(
|
83
|
+
def distance(vector1, vector2)
|
81
84
|
sum_squares = node_count.times.reduce(0.0) do |sum, i|
|
82
|
-
d =
|
85
|
+
d = vector1[i] - vector2[i]
|
83
86
|
sum + d * d
|
84
87
|
end
|
85
88
|
Math.sqrt(sum_squares)
|
data/lib/page_rank/dense.rb
CHANGED
@@ -32,6 +32,7 @@ module PageRank
|
|
32
32
|
# @return (see Base#add)
|
33
33
|
def add(source, dest, weight: 1.0)
|
34
34
|
return if source == dest
|
35
|
+
|
35
36
|
source_idx = index(source)
|
36
37
|
dest_idx = index(dest)
|
37
38
|
@out_links[source_idx] ||= []
|
@@ -72,13 +73,13 @@ module PageRank
|
|
72
73
|
|
73
74
|
def to_matrix
|
74
75
|
total_out_weights = @out_links.map do |links|
|
75
|
-
links
|
76
|
+
links&.compact&.reduce(:+)
|
76
77
|
end
|
77
78
|
Matrix.build(node_count, node_count) do |dest_idx, source_idx|
|
78
79
|
total = total_out_weights[source_idx]
|
79
80
|
if total
|
80
81
|
w = @out_links[source_idx][dest_idx] || 0.0
|
81
|
-
|
82
|
+
damping * w / total + (1 - damping) / node_count.to_f
|
82
83
|
else
|
83
84
|
1.0 / node_count.to_f
|
84
85
|
end
|
data/lib/page_rank/sparse.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module PageRank
|
4
2
|
##
|
5
3
|
# Implementation of PageRank using a sparse matrix representation of the graph
|
@@ -33,6 +31,7 @@ module PageRank
|
|
33
31
|
# @return (see Base#add)
|
34
32
|
def add(source, dest, weight: 1.0)
|
35
33
|
return false if source == dest
|
34
|
+
|
36
35
|
@graph[dest] ||= Set.new
|
37
36
|
@graph[dest] << source
|
38
37
|
@weights[source] ||= Hash.new(0.0)
|
@@ -53,8 +52,8 @@ module PageRank
|
|
53
52
|
def initial_ranks
|
54
53
|
@dangling_nodes = @nodes - @weight_totals.keys
|
55
54
|
@normalized_weights = @weights.each_with_object({}) do |(source, values), h|
|
56
|
-
h[source] = values.
|
57
|
-
|
55
|
+
h[source] = values.transform_values do |w|
|
56
|
+
w / @weight_totals[source]
|
58
57
|
end
|
59
58
|
end
|
60
59
|
Hash[@nodes.map { |k| [k, 1.0 / node_count.to_f] }]
|
@@ -69,7 +68,7 @@ module PageRank
|
|
69
68
|
@dangling_nodes.each do |source|
|
70
69
|
sum += ranks[source] / node_count.to_f
|
71
70
|
end
|
72
|
-
new_ranks[dest] =
|
71
|
+
new_ranks[dest] = damping * sum + (1 - damping) / node_count
|
73
72
|
end
|
74
73
|
end
|
75
74
|
|
@@ -79,8 +78,8 @@ module PageRank
|
|
79
78
|
Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
|
80
79
|
end
|
81
80
|
|
82
|
-
def distance(
|
83
|
-
super(
|
81
|
+
def distance(vector1, vector2)
|
82
|
+
super(vector1.values.to_a, vector2.values.to_a)
|
84
83
|
end
|
85
84
|
|
86
85
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module PageRank
|
2
|
+
class SparseNative < Base
|
3
|
+
|
4
|
+
#require 'page_rank/sparse_native.so'
|
5
|
+
|
6
|
+
# @param (see Base#add)
|
7
|
+
# @param weight [Float] Optional weight for the graph edge
|
8
|
+
# @return (see Base#add)
|
9
|
+
def add(source, dest, weight: 1.0)
|
10
|
+
_add_edge(source, dest, weight) unless source == dest
|
11
|
+
end
|
12
|
+
|
13
|
+
# Perform the PageRank calculation
|
14
|
+
# @param max_iterations [Integer] Maximum number of PageRank iterations to perform (or -1 for no max)
|
15
|
+
# @return [Hash<Object, Float>] of nodes with rank
|
16
|
+
def calculate(max_iterations: -1, **_)
|
17
|
+
_calculate(max_iterations, damping, tolerance)
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
data/lib/page_rank.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
1
3
|
##
|
2
4
|
# A module for supporting Ruby implementations of PageRank. Rather than rely on
|
3
5
|
# one single implementation, this module allows for multiple implementations that
|
@@ -15,16 +17,17 @@
|
|
15
17
|
##
|
16
18
|
module PageRank
|
17
19
|
|
18
|
-
autoload :Base,
|
19
|
-
autoload :Dense,
|
20
|
-
autoload :Sparse,
|
20
|
+
autoload :Base, 'page_rank/base'
|
21
|
+
autoload :Dense, 'page_rank/dense'
|
22
|
+
autoload :Sparse, 'page_rank/sparse'
|
23
|
+
autoload :SparseNative, 'page_rank/sparse_native'
|
21
24
|
|
22
25
|
# @option options [Symbol] :strategy PageRank strategy to use (:sparse, :dense, or :sparse_native)
|
23
26
|
# @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
|
24
27
|
# @option options [Float] :tolerance The desired accuracy of the results
|
25
28
|
# @return [PageRank::Base]
|
26
29
|
def self.new(strategy: :sparse, **options)
|
27
|
-
const_get(strategy.to_s.capitalize).new(**options)
|
30
|
+
const_get(strategy.to_s.split('_').map(&:capitalize).join).new(**options)
|
28
31
|
end
|
29
32
|
|
30
33
|
# Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
|