text_rank 1.2.3 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.codeclimate.yml +1 -1
- data/.gitignore +4 -0
- data/.rubocop.yml +7 -0
- data/.ruby-version +1 -1
- data/.travis.yml +1 -0
- data/Rakefile +5 -0
- data/bin/console +3 -3
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +300 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +5 -0
- data/lib/page_rank/base.rb +12 -9
- data/lib/page_rank/dense.rb +3 -2
- data/lib/page_rank/sparse.rb +6 -7
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/page_rank.rb +7 -4
- data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
- data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
- data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
- data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
- data/lib/text_rank/char_filter.rb +1 -1
- data/lib/text_rank/fingerprint.rb +10 -18
- data/lib/text_rank/fingerprint_overlap.rb +55 -0
- data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
- data/lib/text_rank/keyword_extractor.rb +32 -25
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
- data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
- data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
- data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
- data/lib/text_rank/token_filter/stopwords.rb +1 -321
- data/lib/text_rank/token_filter/stopwords.yml +317 -0
- data/lib/text_rank/tokenizer/money.rb +11 -6
- data/lib/text_rank/tokenizer/number.rb +4 -3
- data/lib/text_rank/tokenizer/punctuation.rb +4 -1
- data/lib/text_rank/tokenizer/url.rb +3 -0
- data/lib/text_rank/tokenizer/whitespace.rb +4 -1
- data/lib/text_rank/tokenizer/word.rb +5 -2
- data/lib/text_rank/tokenizer.rb +1 -1
- data/lib/text_rank/version.rb +3 -1
- data/lib/text_rank.rb +14 -9
- data/text_rank.gemspec +4 -1
- metadata +48 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b25779e7c013e9d0c1d867324f58d40a062bacae0e38f92714e1d3fd7b0e7ef
|
4
|
+
data.tar.gz: 34c36b8ff6673092b2463b9f4e0fdaf94a55e50c3e52e4aeec125775c7fa3a9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f03e71745ed96077c63ed376303fcfaa8683f960319d71a405b943aa4a23383938c914b33c867f76f4d979505aeb1d5a0110b51dcc1eadab14cab41d6ee8697
|
7
|
+
data.tar.gz: edb17a0ee101254a5afc7c7ee5b084e11ce1d8bfd5083d5069a5e0841751a1f8f175fa167b788186b881e72a1114fa17a8cddc8c3328e00dda812a89b8647196
|
data/.codeclimate.yml
CHANGED
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -15,6 +15,10 @@ Layout/EmptyLinesAroundModuleBody:
|
|
15
15
|
Layout/ExtraSpacing:
|
16
16
|
Enabled: false
|
17
17
|
|
18
|
+
Layout/HashAlignment:
|
19
|
+
EnforcedHashRocketStyle: table
|
20
|
+
EnforcedColonStyle: table
|
21
|
+
|
18
22
|
Layout/LineLength:
|
19
23
|
Max: 120
|
20
24
|
Enabled: false
|
@@ -89,6 +93,9 @@ Style/GuardClause:
|
|
89
93
|
Style/HashEachMethods:
|
90
94
|
Enabled: true
|
91
95
|
|
96
|
+
Style/HashSyntax:
|
97
|
+
Enabled: true
|
98
|
+
|
92
99
|
Style/HashTransformKeys:
|
93
100
|
Enabled: true
|
94
101
|
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
ruby-
|
1
|
+
ruby-3.0.3
|
data/.travis.yml
CHANGED
@@ -9,6 +9,7 @@ before_script:
|
|
9
9
|
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
10
10
|
- chmod +x ./cc-test-reporter
|
11
11
|
- ./cc-test-reporter before-build
|
12
|
+
- bundle exec rake compile
|
12
13
|
script:
|
13
14
|
- bundle exec rspec
|
14
15
|
after_script:
|
data/Rakefile
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require "rake/extensiontask"
|
2
3
|
require "rspec/core/rake_task"
|
3
4
|
|
4
5
|
RSpec::Core::RakeTask.new(:spec)
|
@@ -10,3 +11,7 @@ RDoc::Task.new do |rdoc|
|
|
10
11
|
rdoc.main = "README.md"
|
11
12
|
rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
|
12
13
|
end
|
14
|
+
|
15
|
+
Rake::ExtensionTask.new('text_rank') do |ext|
|
16
|
+
ext.lib_dir = 'lib/text_rank'
|
17
|
+
end
|
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'text_rank'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "text_rank"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start
|
@@ -0,0 +1,300 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include <page_rank_sparse_native.h>
|
4
|
+
|
5
|
+
const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
|
6
|
+
const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
|
7
|
+
const size_t NODE_SIZE = sizeof(NodeStruct);
|
8
|
+
const size_t EDGE_SIZE = sizeof(EdgeStruct);
|
9
|
+
const size_t GRAPH_SIZE = sizeof(GraphStruct);
|
10
|
+
|
11
|
+
static const rb_data_type_t graph_typed_data = {
|
12
|
+
"PageRank/SparseNative/Graph",
|
13
|
+
{ 0, free_graph, },
|
14
|
+
0, 0,
|
15
|
+
RUBY_TYPED_FREE_IMMEDIATELY,
|
16
|
+
};
|
17
|
+
|
18
|
+
|
19
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
20
|
+
|
21
|
+
void Init_sparse_native() {
|
22
|
+
VALUE PageRankModule, SparseNativeClass;
|
23
|
+
|
24
|
+
PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
|
25
|
+
SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
|
26
|
+
|
27
|
+
rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
|
28
|
+
rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
|
29
|
+
rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
|
30
|
+
}
|
31
|
+
|
32
|
+
// Allocator for PageRank::SparseNative: builds an empty graph and wraps it
// in a Ruby object of class `self`.
//
// Raises NoMemoryError (via rb_memerror) if the graph cannot be allocated.
VALUE sparse_native_allocate(VALUE self) {
  const struct st_hash_type *objhash;
  st_table *lookup;
  Graph g;

  // Grab a reference to the hash type used by a generic Ruby {} which
  // accepts any key and any value. We need this type to create a st_table
  // in which to put arbitrary VALUE keys. The type is read from a throwaway
  // hash; it is assumed to be a static constant that outlives that hash.
  objhash = rb_hash_tbl(rb_hash_new(), __FILE__, __LINE__)->type;

  // Create the lookup table before the graph so that, should table creation
  // raise, no half-built graph is left behind.
  lookup = st_init_table_with_size(objhash, 0);

  g = malloc(GRAPH_SIZE);
  if (g == NULL) {
    st_free_table(lookup);
    rb_memerror();
  }

  g->node_count = 0;
  g->nodes = NULL;
  g->dangling_nodes = NULL;
  g->node_lookup = lookup;

  return TypedData_Wrap_Struct(self, &graph_typed_data, g);
}
|
49
|
+
|
50
|
+
// Ruby-facing `_add_edge(source, dest, weight)`: unwraps the graph from
// `self` and records a weighted edge between the two labels, creating the
// nodes on first sight. Always returns nil.
VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
  Graph graph;
  double edge_weight;

  TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, graph);
  edge_weight = NUM2DBL(weight);
  add_edge_with_labels(graph, source, dest, edge_weight);

  return Qnil;
}
|
57
|
+
|
58
|
+
// Ruby-facing `_calculate(max_iterations, damping, tolerance)`: runs the
// PageRank iteration on the wrapped graph and returns a new Hash mapping
// each node label to its normalized rank, inserted in decreasing rank order.
//
// max_iterations is converted with NUM2INT (not FIX2INT) so that a
// non-Integer or out-of-range argument raises instead of being silently
// reinterpreted as garbage.
VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
  Graph g;
  VALUE ranks;

  TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
  calculate(g, NUM2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));

  ranks = rb_hash_new();
  sort_and_normalize_ranks(g, rb_hash_dset, ranks);
  return ranks;
}
|
69
|
+
|
70
|
+
// Stores a C double in a Ruby hash under `key`, boxing it as a Ruby Float.
// Shaped to match the callback expected by sort_and_normalize_ranks.
void rb_hash_dset(VALUE hash, VALUE key, double value) {
  VALUE boxed = DBL2NUM(value);

  rb_hash_aset(hash, key, boxed);
}
|
73
|
+
|
74
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
75
|
+
|
76
|
+
void free_graph(void *data) {
|
77
|
+
Graph g = (Graph)data;
|
78
|
+
free_node_list(g->nodes, free_node);
|
79
|
+
free_node_list(g->dangling_nodes, NULL);
|
80
|
+
free(g->node_lookup);
|
81
|
+
free(g);
|
82
|
+
}
|
83
|
+
|
84
|
+
// Releases a node together with its list of inbound edges (each edge is
// freed via free_edge).
void free_node(Node n) {
  EdgeList inbound = n->source_edges;

  free_edge_list(inbound, free_edge);
  free(n);
}
|
88
|
+
|
89
|
+
// Frees every cell of a node list. When `free_item` is non-NULL it is called
// on each cell's node first; pass NULL for lists that merely borrow nodes.
void free_node_list(NodeList nodes, void (*free_item)(Node)) {
  NodeList next;

  for (; nodes != NULL; nodes = next) {
    // Remember the successor before the current cell is released
    next = nodes->next;
    if (free_item != NULL) {
      free_item(nodes->node);
    }
    free(nodes);
  }
}
|
100
|
+
|
101
|
+
// Releases a single edge. The edge's source node is not touched here: it is
// assumed to be allocated, and eventually freed, elsewhere.
void free_edge(Edge e) {
  free(e);
}
|
105
|
+
|
106
|
+
// Frees every cell of an edge list. When `free_item` is non-NULL it is
// called on each cell's edge before the cell itself is released.
void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
  EdgeList next;

  for (; edges != NULL; edges = next) {
    // Remember the successor before the current cell is released
    next = edges->next;
    if (free_item != NULL) {
      free_item(edges->edge);
    }
    free(edges);
  }
}
|
117
|
+
|
118
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
119
|
+
|
120
|
+
// Creates a node for `label` with zeroed ranks and no edges, pushes it onto
// the front of the graph's node list and bumps the node count.
//
// Returns the new node. Raises NoMemoryError (via rb_memerror) if either
// allocation fails; nothing is leaked and the graph is left unchanged.
Node add_node(Graph g, VALUE label) {
  NodeList tmp = malloc(NODE_LIST_SIZE);

  if (tmp == NULL) {
    rb_memerror();
  }

  tmp->node = malloc(NODE_SIZE);
  if (tmp->node == NULL) {
    free(tmp);
    rb_memerror();
  }

  tmp->node->label = label;
  tmp->node->source_edges = NULL;
  tmp->node->rank = 0.0;
  tmp->node->prev_rank = 0.0;
  tmp->node->outbound_weight_total = 0.0;

  // Link the cell in only once the node is fully initialized
  tmp->next = g->nodes;
  g->nodes = tmp;
  g->node_count += 1;

  return tmp->node;
}
|
136
|
+
|
137
|
+
// Pushes an existing node onto the front of the graph's dangling-node list.
// The list only references the node; ownership stays with the main node list.
//
// Returns `n`. Raises NoMemoryError (via rb_memerror) if the list cell
// cannot be allocated.
Node add_dangling_node(Graph g, Node n) {
  NodeList tmp = malloc(NODE_LIST_SIZE);

  if (tmp == NULL) {
    rb_memerror();
  }

  tmp->node = n;
  tmp->next = g->dangling_nodes;
  g->dangling_nodes = tmp;

  return n;
}
|
146
|
+
|
147
|
+
// Creates a weighted edge from `source` to `destination`. The edge is stored
// on the destination (in its list of inbound "source edges") and the weight
// is added to the source's running outbound total.
//
// Returns the new edge. Raises NoMemoryError (via rb_memerror) if either
// allocation fails; nothing is leaked and neither node is modified.
Edge add_edge(Node source, Node destination, double weight) {
  EdgeList tmp = malloc(EDGE_LIST_SIZE);

  if (tmp == NULL) {
    rb_memerror();
  }

  tmp->edge = malloc(EDGE_SIZE);
  if (tmp->edge == NULL) {
    free(tmp);
    rb_memerror();
  }

  tmp->edge->source = source;
  tmp->edge->weight = weight;

  tmp->next = destination->source_edges;
  destination->source_edges = tmp;
  source->outbound_weight_total += weight;

  return tmp->edge;
}
|
160
|
+
|
161
|
+
// Adds a weighted edge between the nodes identified by two Ruby labels,
// creating either node if the graph has not seen its label yet.
Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
  // Resolve the source before the destination: a first-seen label creates a
  // node, so the resolution order determines node insertion order.
  Node from = lookup_node(g, source_label);
  Node to = lookup_node(g, dest_label);

  return add_edge(from, to, weight);
}
|
169
|
+
|
170
|
+
// Returns the node registered for `label`, creating and registering a new
// node when the label is not yet present in the graph's lookup table.
Node lookup_node(Graph g, VALUE label) {
  st_data_t found;
  Node created;

  if (st_lookup(g->node_lookup, (st_data_t)label, &found)) {
    return (Node)found;
  }

  created = add_node(g, label);
  st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)created);
  return created;
}
|
179
|
+
|
180
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
181
|
+
|
182
|
+
// Prepares the graph for a PageRank run: rebuilds the list of dangling nodes
// and resets every node to the uniform initial rank.
//
// This function is safe to call repeatedly. The dangling list is rebuilt from
// scratch each time, and edge weights are left untouched (they are normalized
// on the fly in calculate_step), so running a calculation twice, or adding
// edges between runs, does not corrupt the graph.
void calculate_start(Graph g) {
  NodeList nodes;
  Node n;

  // Drop any dangling list left over from a previous run (cells only; the
  // nodes themselves are owned by g->nodes)
  free_node_list(g->dangling_nodes, NULL);
  g->dangling_nodes = NULL;

  for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
    n = nodes->node;

    // If there is no outbound weight, this is a "dangling" node
    if (n->outbound_weight_total == 0.0) {
      add_dangling_node(g, n);
    }

    // Set the initial rank
    n->prev_rank = 0;
    n->rank = 1.0 / g->node_count;
  }
}

// Performs one PageRank iteration: each node's rank becomes the damped sum of
// the rank flowing in over its inbound edges plus an equal share of the rank
// held by dangling nodes, plus the (1 - damping) / node_count teleport term.
void calculate_step(Graph g, double damping) {
  NodeList nodes, dangling_nodes;
  Node source, destination;
  EdgeList edges;
  Edge e;
  double sum, dangling_sum = 0.0;

  // Set prev rank to rank for all nodes
  for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
    destination = nodes->node;
    destination->prev_rank = destination->rank;
  }

  // The dangling contribution is identical for every destination, so it is
  // computed once per step instead of once per node
  for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
    source = dangling_nodes->node;
    dangling_sum += source->prev_rank / g->node_count;
  }

  // Re-distribute the rankings according to weight
  for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
    destination = nodes->node;
    sum = 0.0;
    for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
      e = edges->edge;
      source = e->source;
      // Normalize by the source's total outbound weight. A source whose
      // edges all weigh zero is treated as dangling above, so skip it here
      // rather than computing 0/0.
      if (source->outbound_weight_total != 0.0) {
        sum += source->prev_rank * (e->weight / source->outbound_weight_total);
      }
    }
    destination->rank = damping * (sum + dangling_sum) + (1 - damping) / g->node_count;
  }
}
|
238
|
+
|
239
|
+
// Calculate the Euclidean distance from prev_rank to rank across all nodes
|
240
|
+
double prev_distance(Graph g) {
|
241
|
+
NodeList nodes;
|
242
|
+
Node n;
|
243
|
+
double rank_diff, sum_squares = 0.0;
|
244
|
+
|
245
|
+
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
|
246
|
+
n = nodes->node;
|
247
|
+
rank_diff = n->prev_rank - n->rank;
|
248
|
+
sum_squares += rank_diff * rank_diff;
|
249
|
+
}
|
250
|
+
|
251
|
+
return sqrt(sum_squares);
|
252
|
+
}
|
253
|
+
|
254
|
+
// Runs PageRank on the graph until the rank vector moves by less than
// `tolerance` between two iterations, or until `max_iterations` steps have
// been performed.
//
// max_iterations: 0 performs no iterations; a negative value (e.g. -1) means
//                 "no limit".
void calculate(Graph g, int max_iterations, double damping, double tolerance) {
  calculate_start(g);

  while (max_iterations != 0) {
    calculate_step(g, damping);
    if (prev_distance(g) < tolerance) {
      break;
    }
    // Only count down a real limit. Decrementing a negative "no limit" value
    // forever would eventually overflow a signed int (undefined behavior).
    if (max_iterations > 0) {
      max_iterations--;
    }
  }
}
|
265
|
+
|
266
|
+
int node_compare(const void *v1, const void *v2) {
|
267
|
+
double rank1, rank2, cmp;
|
268
|
+
|
269
|
+
rank1 = (*(Node *)v1)->rank;
|
270
|
+
rank2 = (*(Node *)v2)->rank;
|
271
|
+
cmp = rank2 - rank1; // Decreasing order
|
272
|
+
if (cmp < 0) return -1;
|
273
|
+
if (cmp > 0) return 1;
|
274
|
+
return 0;
|
275
|
+
}
|
276
|
+
|
277
|
+
// Reports every node's rank, normalized so that all ranks sum to 1, in
// decreasing rank order. For each node, `callback` is invoked with
// (callback_arg, node label, normalized rank).
//
// Does nothing for an empty graph. Raises NoMemoryError (via rb_memerror) if
// the temporary sort buffer cannot be allocated.
void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
  NodeList nodes;
  Node n;
  double sum = 0.0;
  unsigned long i;
  Node *tmp;

  // Nothing to report; also avoids malloc(0), which may legitimately return
  // NULL and would otherwise look like an allocation failure
  if (g->node_count == 0) {
    return;
  }

  tmp = malloc(g->node_count * sizeof(Node));
  if (tmp == NULL) {
    rb_memerror();
  }

  // The node list is in reverse insertion order, so fill the array from the
  // back to restore insertion order before sorting
  i = g->node_count;
  for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
    n = nodes->node;
    tmp[--i] = n;
    sum += n->rank;
  }

  qsort(tmp, g->node_count, sizeof(Node), node_compare);

  for (i = 0; i < g->node_count; i++) {
    n = tmp[i];
    callback(callback_arg, n->label, n->rank / sum);
  }

  free(tmp);
}
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#ifndef PAGE_RANK_SPARSE_NATIVE_H
#define PAGE_RANK_SPARSE_NATIVE_H

#include <ruby.h>

// Singly linked list cell holding one node.
struct NodeListStruct;
typedef struct NodeListStruct* NodeList;

typedef struct NodeListStruct {
  struct NodeStruct *node;
  struct NodeListStruct *next;
} NodeListStruct;

//////////////////////////////////////////////////////////////////////////////////////

// Singly linked list cell holding one edge.
struct EdgeListStruct;
typedef struct EdgeListStruct* EdgeList;

typedef struct EdgeListStruct {
  struct EdgeStruct *edge;
  struct EdgeListStruct *next;
} EdgeListStruct;

//////////////////////////////////////////////////////////////////////////////////////

// A graph node.
struct NodeStruct;
typedef struct NodeStruct* Node;

typedef struct NodeStruct {
  EdgeList source_edges;         // Inbound edges (edges whose destination is this node)
  VALUE label;                   // Ruby object identifying the node
  double prev_rank;              // Rank before the most recent iteration
  double rank;                   // Current rank
  double outbound_weight_total;  // Sum of the weights of all edges leaving this node
} NodeStruct;

//////////////////////////////////////////////////////////////////////////////////////

// A weighted edge; it is stored on its destination node, so only the source
// is recorded here.
struct EdgeStruct;
typedef struct EdgeStruct* Edge;

typedef struct EdgeStruct {
  Node source;
  double weight;
} EdgeStruct;

//////////////////////////////////////////////////////////////////////////////////////

// The whole graph.
struct GraphStruct;
typedef struct GraphStruct* Graph;

typedef struct GraphStruct {
  unsigned long node_count;  // Number of cells in `nodes`
  NodeList nodes;            // All nodes
  NodeList dangling_nodes;   // Nodes with no outbound weight
  st_table *node_lookup;     // Maps a label (VALUE) to its Node
} GraphStruct;

//////////////////////////////////////////////////////////////////////////////////////

// Memory management
void free_graph(void *data);
void free_node(Node n);
void free_node_list(NodeList nodes, void (*free_item)(Node));
void free_edge(Edge e);
void free_edge_list(EdgeList edges, void (*free_item)(Edge));

//////////////////////////////////////////////////////////////////////////////////////

// Graph construction
Node add_node(Graph g, VALUE label);
Node add_dangling_node(Graph g, Node n);
Edge add_edge(Node source, Node destination, double weight);
Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
Node lookup_node(Graph g, VALUE label);

//////////////////////////////////////////////////////////////////////////////////////

// PageRank calculation
void calculate_start(Graph g);
void calculate_step(Graph g, double damping);
double prev_distance(Graph g);
void calculate(Graph g, int max_iterations, double damping, double tolerance);
int node_compare(const void *v1, const void *v2);
void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);

//////////////////////////////////////////////////////////////////////////////////////

// Ruby bindings
void Init_sparse_native(void);
VALUE sparse_native_allocate(VALUE self);
VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
// NOTE(review): no definition of sorted_and_normalized_ranks is visible in
// page_rank_sparse_native.c (which defines sort_and_normalize_ranks instead);
// this prototype looks stale -- confirm before removing.
VALUE sorted_and_normalized_ranks(Graph g);
void rb_hash_dset(VALUE hash, VALUE key, double value);

#endif
|
data/lib/page_rank/base.rb
CHANGED
@@ -7,6 +7,8 @@ module PageRank
|
|
7
7
|
##
|
8
8
|
class Base
|
9
9
|
|
10
|
+
attr_reader :damping, :tolerance
|
11
|
+
|
10
12
|
# @param (see #damping=)
|
11
13
|
# @param (see #tolerance=)
|
12
14
|
def initialize(damping: nil, tolerance: nil, **_)
|
@@ -19,8 +21,7 @@ module PageRank
|
|
19
21
|
# @return [Float]
|
20
22
|
def damping=(damping)
|
21
23
|
@damping = damping || 0.85
|
22
|
-
raise ArgumentError
|
23
|
-
@damping
|
24
|
+
raise ArgumentError, 'Invalid damping factor' if @damping <= 0 || @damping > 1
|
24
25
|
end
|
25
26
|
|
26
27
|
# Set the tolerance value
|
@@ -28,8 +29,7 @@ module PageRank
|
|
28
29
|
# @return [Float]
|
29
30
|
def tolerance=(tolerance)
|
30
31
|
@tolerance = tolerance || 0.0001
|
31
|
-
raise ArgumentError
|
32
|
-
@tolerance
|
32
|
+
raise ArgumentError, 'Invalid tolerance factor' if @tolerance.negative? || @tolerance > 1
|
33
33
|
end
|
34
34
|
|
35
35
|
# Adds a directed (and optionally weighted) edge to the graph
|
@@ -46,9 +46,12 @@ module PageRank
|
|
46
46
|
def calculate(max_iterations: -1, **_)
|
47
47
|
ranks = initial_ranks
|
48
48
|
loop do
|
49
|
-
break if max_iterations
|
50
|
-
|
51
|
-
|
49
|
+
break if max_iterations.zero?
|
50
|
+
|
51
|
+
prev_ranks = ranks
|
52
|
+
ranks = calculate_step(ranks)
|
53
|
+
break if distance(ranks, prev_ranks) < tolerance
|
54
|
+
|
52
55
|
max_iterations -= 1
|
53
56
|
end
|
54
57
|
sort_ranks(ranks)
|
@@ -77,9 +80,9 @@ module PageRank
|
|
77
80
|
end
|
78
81
|
|
79
82
|
# Calculate the Euclidean distance from one ranking to the next iteration
|
80
|
-
def distance(
|
83
|
+
def distance(vector1, vector2)
|
81
84
|
sum_squares = node_count.times.reduce(0.0) do |sum, i|
|
82
|
-
d =
|
85
|
+
d = vector1[i] - vector2[i]
|
83
86
|
sum + d * d
|
84
87
|
end
|
85
88
|
Math.sqrt(sum_squares)
|
data/lib/page_rank/dense.rb
CHANGED
@@ -32,6 +32,7 @@ module PageRank
|
|
32
32
|
# @return (see Base#add)
|
33
33
|
def add(source, dest, weight: 1.0)
|
34
34
|
return if source == dest
|
35
|
+
|
35
36
|
source_idx = index(source)
|
36
37
|
dest_idx = index(dest)
|
37
38
|
@out_links[source_idx] ||= []
|
@@ -72,13 +73,13 @@ module PageRank
|
|
72
73
|
|
73
74
|
def to_matrix
|
74
75
|
total_out_weights = @out_links.map do |links|
|
75
|
-
links
|
76
|
+
links&.compact&.reduce(:+)
|
76
77
|
end
|
77
78
|
Matrix.build(node_count, node_count) do |dest_idx, source_idx|
|
78
79
|
total = total_out_weights[source_idx]
|
79
80
|
if total
|
80
81
|
w = @out_links[source_idx][dest_idx] || 0.0
|
81
|
-
|
82
|
+
damping * w / total + (1 - damping) / node_count.to_f
|
82
83
|
else
|
83
84
|
1.0 / node_count.to_f
|
84
85
|
end
|
data/lib/page_rank/sparse.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module PageRank
|
4
2
|
##
|
5
3
|
# Implementation of PageRank using a sparse matrix representation of the graph
|
@@ -33,6 +31,7 @@ module PageRank
|
|
33
31
|
# @return (see Base#add)
|
34
32
|
def add(source, dest, weight: 1.0)
|
35
33
|
return false if source == dest
|
34
|
+
|
36
35
|
@graph[dest] ||= Set.new
|
37
36
|
@graph[dest] << source
|
38
37
|
@weights[source] ||= Hash.new(0.0)
|
@@ -53,8 +52,8 @@ module PageRank
|
|
53
52
|
def initial_ranks
|
54
53
|
@dangling_nodes = @nodes - @weight_totals.keys
|
55
54
|
@normalized_weights = @weights.each_with_object({}) do |(source, values), h|
|
56
|
-
h[source] = values.
|
57
|
-
|
55
|
+
h[source] = values.transform_values do |w|
|
56
|
+
w / @weight_totals[source]
|
58
57
|
end
|
59
58
|
end
|
60
59
|
Hash[@nodes.map { |k| [k, 1.0 / node_count.to_f] }]
|
@@ -69,7 +68,7 @@ module PageRank
|
|
69
68
|
@dangling_nodes.each do |source|
|
70
69
|
sum += ranks[source] / node_count.to_f
|
71
70
|
end
|
72
|
-
new_ranks[dest] =
|
71
|
+
new_ranks[dest] = damping * sum + (1 - damping) / node_count
|
73
72
|
end
|
74
73
|
end
|
75
74
|
|
@@ -79,8 +78,8 @@ module PageRank
|
|
79
78
|
Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
|
80
79
|
end
|
81
80
|
|
82
|
-
def distance(
|
83
|
-
super(
|
81
|
+
def distance(vector1, vector2)
|
82
|
+
super(vector1.values.to_a, vector2.values.to_a)
|
84
83
|
end
|
85
84
|
|
86
85
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module PageRank
  ##
  # PageRank strategy whose graph storage and rank calculation are delegated
  # to the private +_add_edge+ and +_calculate+ methods (registered on this
  # class by the native extension).
  ##
  class SparseNative < Base

    #require 'page_rank/sparse_native.so'

    # Adds a directed, weighted edge to the graph; self-referencing edges
    # are ignored.
    # @param (see Base#add)
    # @param weight [Float] Optional weight for the graph edge
    # @return (see Base#add)
    def add(source, dest, weight: 1.0)
      return if source == dest

      _add_edge(source, dest, weight)
    end

    # Perform the PageRank calculation
    # @param max_iterations [Integer] Maximum number of PageRank iterations to perform (or -1 for no max)
    # @return [Hash<Object, Float>] of nodes with rank
    def calculate(max_iterations: -1, **_)
      _calculate(max_iterations, damping, tolerance)
    end

  end
end
|
data/lib/page_rank.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
1
3
|
##
|
2
4
|
# A module for supporting Ruby implementations of PageRank. Rather than rely on
|
3
5
|
# one single implementation, this module allows for multiple implementations that
|
@@ -15,16 +17,17 @@
|
|
15
17
|
##
|
16
18
|
module PageRank
|
17
19
|
|
18
|
-
autoload :Base,
|
19
|
-
autoload :Dense,
|
20
|
-
autoload :Sparse,
|
20
|
+
autoload :Base, 'page_rank/base'
|
21
|
+
autoload :Dense, 'page_rank/dense'
|
22
|
+
autoload :Sparse, 'page_rank/sparse'
|
23
|
+
autoload :SparseNative, 'page_rank/sparse_native'
|
21
24
|
|
22
25
|
# @option options [Symbol] :strategy PageRank strategy to use (one of :sparse, :dense, or :sparse_native)
|
23
26
|
# @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
|
24
27
|
# @option options [Float] :tolerance The desired accuracy of the results
|
25
28
|
# @return [PageRank::Base]
|
26
29
|
def self.new(strategy: :sparse, **options)
|
27
|
-
const_get(strategy.to_s.capitalize).new(**options)
|
30
|
+
const_get(strategy.to_s.split('_').map(&:capitalize).join).new(**options)
|
28
31
|
end
|
29
32
|
|
30
33
|
# Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
|