text_rank 1.2.4 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.codeclimate.yml +11 -1
- data/.github/workflows/ci.yml +48 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +105 -45
- data/.ruby-version +1 -1
- data/.yardopts +6 -0
- data/CODE_OF_CONDUCT.md +120 -36
- data/README.md +4 -3
- data/Rakefile +9 -7
- data/ext/text_rank/extconf.rb +3 -0
- data/ext/text_rank/page_rank_sparse_native.c +293 -0
- data/ext/text_rank/page_rank_sparse_native.h +93 -0
- data/ext/text_rank/text_rank.c +6 -0
- data/lib/page_rank/base.rb +4 -2
- data/lib/page_rank/dense.rb +1 -1
- data/lib/page_rank/sparse.rb +3 -3
- data/lib/page_rank/sparse_native.rb +21 -0
- data/lib/page_rank.rb +5 -4
- data/lib/text_rank/char_filter/ascii_folding.rb +0 -2
- data/lib/text_rank/char_filter/strip_html.rb +1 -0
- data/lib/text_rank/fingerprint.rb +2 -2
- data/lib/text_rank/graph_strategy/coocurrence.rb +6 -6
- data/lib/text_rank/keyword_extractor.rb +13 -5
- data/lib/text_rank/rank_filter/collapse_adjacent.rb +3 -3
- data/lib/text_rank/rank_filter/sort_by_value.rb +1 -1
- data/lib/text_rank/tokenizer/money.rb +2 -4
- data/lib/text_rank/version.rb +1 -1
- data/lib/text_rank.rb +3 -1
- data/text_rank.gemspec +12 -1
- metadata +102 -11
- data/.travis.yml +0 -15
@@ -0,0 +1,293 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include <page_rank_sparse_native.h>
|
4
|
+
|
5
|
+
const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
|
6
|
+
const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
|
7
|
+
const size_t NODE_SIZE = sizeof(NodeStruct);
|
8
|
+
const size_t EDGE_SIZE = sizeof(EdgeStruct);
|
9
|
+
const size_t GRAPH_SIZE = sizeof(GraphStruct);
|
10
|
+
|
11
|
+
static const rb_data_type_t graph_typed_data = {
|
12
|
+
"PageRank/SparseNative/Graph",
|
13
|
+
{ 0, free_graph, },
|
14
|
+
0, 0,
|
15
|
+
RUBY_TYPED_FREE_IMMEDIATELY,
|
16
|
+
};
|
17
|
+
|
18
|
+
|
19
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
20
|
+
|
21
|
+
void Init_sparse_native() {
|
22
|
+
VALUE PageRankModule, SparseNativeClass;
|
23
|
+
|
24
|
+
PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
|
25
|
+
SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
|
26
|
+
|
27
|
+
rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
|
28
|
+
rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
|
29
|
+
rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
|
30
|
+
}
|
31
|
+
|
32
|
+
// Allocator for PageRank::SparseNative: builds an empty graph and wraps it in a
// Ruby object of class `self`.
//
// The Ruby-side calls are made BEFORE the graph itself is malloc'd, so that an
// exception raised by one of them cannot leak the graph. A failed malloc is
// reported as a Ruby NoMemoryError instead of dereferencing NULL.
VALUE sparse_native_allocate(VALUE self) {
  const struct st_hash_type *objhash;
  st_table *lookup;
  Graph g;

  // Borrow the hash type used by a generic Ruby {} (any key, any value) so the
  // lookup table can be keyed by arbitrary VALUE labels. The type descriptor is
  // expected to be a static constant inside Ruby, hence safe to keep after the
  // temporary hash is collected -- NOTE(review): confirm for new Ruby versions.
  objhash = rb_hash_tbl(rb_hash_new(), "page_rank_sparse_native.c", 40)->type;
  lookup = st_init_table_with_size(objhash, 0);

  g = malloc(GRAPH_SIZE);
  if (g == NULL) {
    st_free_table(lookup);
    rb_memerror();
  }

  g->node_count = 0;
  g->nodes = NULL;
  g->dangling_nodes = NULL;
  g->node_lookup = lookup;

  return TypedData_Wrap_Struct(self, &graph_typed_data, g);
}
|
49
|
+
|
50
|
+
// Ruby method `_add_edge(source, dest, weight)`.
//
// Unwraps the graph held by `self`, converts `weight` to a C double and records
// a weighted edge from `source` to `dest`. Always returns nil.
VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
  Graph graph;
  double edge_weight;

  TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, graph);
  edge_weight = NUM2DBL(weight);
  add_edge_with_labels(graph, source, dest, edge_weight);

  return Qnil;
}
|
57
|
+
|
58
|
+
// Ruby method `_calculate(max_iterations, damping, tolerance)`.
//
// Runs the PageRank iteration on the graph held by `self` and returns a new
// Hash mapping each node label to its normalized rank, inserted in decreasing
// rank order.
//
// max_iterations - Integer; iteration stops when the counter reaches zero, so
//                  a negative value (e.g. -1) means "no limit".
// damping        - Numeric damping factor.
// tolerance      - Numeric; iteration stops once the distance between two
//                  successive rank vectors drops below it.
//
// NUM2INT is used rather than FIX2INT so that a non-Integer or out-of-range
// argument raises a Ruby exception instead of being reinterpreted as garbage.
VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
  Graph g;
  VALUE ranks;
  int iteration_limit;
  double damping_factor, tolerance_value;

  TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);

  // Convert every argument up front: a conversion failure then raises before
  // any graph state has been touched.
  iteration_limit = NUM2INT(max_iterations);
  damping_factor = NUM2DBL(damping);
  tolerance_value = NUM2DBL(tolerance);

  calculate(g, iteration_limit, damping_factor, tolerance_value);

  ranks = rb_hash_new();
  sort_and_normalize_ranks(g, rb_hash_dset, ranks);
  return ranks;
}
|
69
|
+
|
70
|
+
// Stores a C double in a Ruby hash: hash[key] = value (boxed as a Float).
void rb_hash_dset(VALUE hash, VALUE key, double value) {
  VALUE boxed = DBL2NUM(value);

  rb_hash_aset(hash, key, boxed);
}
|
73
|
+
|
74
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
75
|
+
|
76
|
+
void free_graph(void *data) {
|
77
|
+
Graph g = (Graph)data;
|
78
|
+
free_node_list(g->nodes, free_node);
|
79
|
+
free_node_list(g->dangling_nodes, NULL);
|
80
|
+
free(g->node_lookup);
|
81
|
+
free(g);
|
82
|
+
}
|
83
|
+
|
84
|
+
// Frees one node together with its list of inbound edges (and the edges
// themselves). `n` must not be NULL.
void free_node(Node n) {
  EdgeList inbound = n->source_edges;

  free_edge_list(inbound, free_edge);
  free(n);
}
|
88
|
+
|
89
|
+
// Frees every cell of a node list.
//
// nodes     - head of the list (may be NULL).
// free_item - if non-NULL, called on each cell's node before the cell is
//             freed; pass NULL when the list merely borrows its nodes.
void free_node_list(NodeList nodes, void (*free_item)(Node)) {
  NodeList cell, following;

  for (cell = nodes; cell != NULL; cell = following) {
    // Grab the successor first: the cell is gone after free().
    following = cell->next;
    if (free_item != NULL) {
      free_item(cell->node);
    }
    free(cell);
  }
}
|
99
|
+
|
100
|
+
// Frees a single edge record.
void free_edge(Edge e) {
  // The source node the edge points at is not owned by the edge: it is
  // assumed to be allocated, and later freed, elsewhere.
  free(e);
}
|
104
|
+
|
105
|
+
// Frees every cell of an edge list.
//
// edges     - head of the list (may be NULL).
// free_item - if non-NULL, called on each cell's edge before the cell is
//             freed.
void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
  EdgeList cell, following;

  for (cell = edges; cell != NULL; cell = following) {
    // Grab the successor first: the cell is gone after free().
    following = cell->next;
    if (free_item != NULL) {
      free_item(cell->edge);
    }
    free(cell);
  }
}
|
115
|
+
|
116
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
117
|
+
|
118
|
+
// Creates a node labelled `label`, prepends it to g->nodes and bumps
// g->node_count. Returns the new node, which is owned by the graph.
//
// The node is not registered in g->node_lookup here.
//
// Raises a Ruby NoMemoryError if either allocation fails, after releasing
// whichever allocation did succeed.
Node add_node(Graph g, VALUE label) {
  NodeList cell = malloc(NODE_LIST_SIZE);
  Node node = malloc(NODE_SIZE);

  if (cell == NULL || node == NULL) {
    free(cell);
    free(node);
    rb_memerror();
  }

  // Fully initialize the node before it becomes reachable from the graph.
  node->label = label;
  node->source_edges = NULL;
  node->rank = 0.0;
  node->prev_rank = 0.0;
  node->outbound_weight_total = 0.0;

  cell->node = node;
  cell->next = g->nodes;
  g->nodes = cell;
  g->node_count += 1;

  return node;
}
|
134
|
+
|
135
|
+
// Prepends `n` to the graph's list of dangling nodes and returns `n`.
//
// Only a new list cell is allocated; the node itself stays owned by g->nodes.
// Raises a Ruby NoMemoryError if the cell cannot be allocated.
Node add_dangling_node(Graph g, Node n) {
  NodeList cell = malloc(NODE_LIST_SIZE);

  if (cell == NULL) {
    rb_memerror();
  }

  cell->node = n;
  cell->next = g->dangling_nodes;
  g->dangling_nodes = cell;

  return n;
}
|
144
|
+
|
145
|
+
// Records a weighted edge from `source` to `destination` and returns it.
//
// The edge is stored on the DESTINATION node (in its source_edges list), while
// the weight is accumulated into the SOURCE node's outbound_weight_total.
//
// Raises a Ruby NoMemoryError if either allocation fails, after releasing
// whichever allocation did succeed.
Edge add_edge(Node source, Node destination, double weight) {
  EdgeList cell = malloc(EDGE_LIST_SIZE);
  Edge edge = malloc(EDGE_SIZE);

  if (cell == NULL || edge == NULL) {
    free(cell);
    free(edge);
    rb_memerror();
  }

  edge->source = source;
  edge->weight = weight;

  cell->edge = edge;
  cell->next = destination->source_edges;
  destination->source_edges = cell;
  source->outbound_weight_total += weight;

  return edge;
}
|
158
|
+
|
159
|
+
// Adds a weighted edge between the nodes identified by two Ruby labels,
// creating either node on first use. The source label is resolved before the
// destination label. Returns the new edge.
Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
  Node from = lookup_node(g, source_label);
  Node to = lookup_node(g, dest_label);

  return add_edge(from, to, weight);
}
|
167
|
+
|
168
|
+
// Returns the node registered under `label`, creating and registering a new
// one if the label has not been seen before.
//
// st_lookup stores its result through an st_data_t pointer, so a genuine
// st_data_t temporary is used rather than casting the address of a Node.
Node lookup_node(Graph g, VALUE label) {
  st_data_t found;
  Node n;

  if (st_lookup(g->node_lookup, (st_data_t)label, &found)) {
    return (Node)found;
  }

  n = add_node(g, label);
  st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
  return n;
}
|
177
|
+
|
178
|
+
//////////////////////////////////////////////////////////////////////////////////////
|
179
|
+
|
180
|
+
// Prepares the graph for iteration:
//   1. rebuilds the list of dangling nodes (nodes with no outbound weight),
//   2. divides every edge weight by its source's total outbound weight,
//   3. gives every node the same initial rank, 1 / node_count.
//
// The dangling list is emptied first; otherwise each call would append the
// same nodes again and later steps would count their rank several times.
//
// NOTE(review): step 2 divides the stored weights in place on every call, so
// running a second calculation on the same graph re-normalizes weights that
// were already normalized. Confirm whether repeated calculation is supported.
void calculate_start(Graph g) {
  NodeList cell;
  EdgeList inbound;

  free_node_list(g->dangling_nodes, NULL);
  g->dangling_nodes = NULL;

  for (cell = g->nodes; cell != NULL; cell = cell->next) {
    Node destination = cell->node;

    // If there is no outbound weight, this is a "dangling" node
    if (destination->outbound_weight_total == 0.0) {
      add_dangling_node(g, destination);
    }

    // Normalize all source edge weights
    for (inbound = destination->source_edges; inbound != NULL; inbound = inbound->next) {
      Edge e = inbound->edge;
      Node source = e->source;

      e->weight = e->weight / source->outbound_weight_total;
    }

    // Set the initial rank
    destination->prev_rank = 0;
    destination->rank = 1.0 / g->node_count;
  }
}
|
206
|
+
|
207
|
+
// Performs one PageRank iteration.
//
// Every node's current rank is first saved into prev_rank. Each node's new
// rank is then
//
//   damping * (inbound + dangling) + (1 - damping) / node_count
//
// where `inbound` is the sum of prev_rank * weight over the node's inbound
// edges and `dangling` is the sum of prev_rank / node_count over all dangling
// nodes.
//
// The dangling term is identical for every destination, so it is computed once
// per step instead of once per node. This only changes the order of the
// floating-point additions, so results may differ in the last few bits.
void calculate_step(Graph g, double damping) {
  NodeList cell;
  EdgeList inbound;
  double dangling_share = 0.0;
  double teleport;

  // Set prev rank to rank for all nodes
  for (cell = g->nodes; cell != NULL; cell = cell->next) {
    cell->node->prev_rank = cell->node->rank;
  }

  // Rank handed out evenly by the dangling nodes (same for every destination)
  for (cell = g->dangling_nodes; cell != NULL; cell = cell->next) {
    dangling_share += cell->node->prev_rank / g->node_count;
  }

  teleport = (1 - damping) / g->node_count;

  // Re-distribute the rankings according to weight
  for (cell = g->nodes; cell != NULL; cell = cell->next) {
    Node destination = cell->node;
    double sum = 0.0;

    for (inbound = destination->source_edges; inbound != NULL; inbound = inbound->next) {
      Edge e = inbound->edge;

      sum += e->source->prev_rank * e->weight;
    }

    destination->rank = damping * (sum + dangling_share) + teleport;
  }
}
|
235
|
+
|
236
|
+
// Calculate the Euclidean distance from prev_rank to rank across all nodes
|
237
|
+
double prev_distance(Graph g) {
|
238
|
+
double sum_squares = 0.0;
|
239
|
+
|
240
|
+
for (NodeList nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
|
241
|
+
Node n = nodes->node;
|
242
|
+
double rank_diff = n->prev_rank - n->rank;
|
243
|
+
sum_squares += rank_diff * rank_diff;
|
244
|
+
}
|
245
|
+
|
246
|
+
return sqrt(sum_squares);
|
247
|
+
}
|
248
|
+
|
249
|
+
// Runs PageRank on the graph: initializes it, then iterates until either the
// rank vector moves by less than `tolerance` in one step or the iteration
// budget is used up.
//
// The budget counts down to zero, so max_iterations == 0 performs no step and
// a negative value (e.g. -1) allows the loop to go without limit.
void calculate(Graph g, int max_iterations, double damping, double tolerance) {
  int remaining;

  calculate_start(g);

  for (remaining = max_iterations; remaining != 0; remaining--) {
    calculate_step(g, damping);
    if (prev_distance(g) < tolerance) {
      break;
    }
  }
}
|
260
|
+
|
261
|
+
int node_compare(const void *v1, const void *v2) {
|
262
|
+
double rank1 = (*(Node *)v1)->rank;
|
263
|
+
double rank2 = (*(Node *)v2)->rank;
|
264
|
+
double cmp = rank2 - rank1; // Decreasing order
|
265
|
+
if (cmp < 0) return -1;
|
266
|
+
if (cmp > 0) return 1;
|
267
|
+
return 0;
|
268
|
+
}
|
269
|
+
|
270
|
+
// Reports every node's rank, highest first, scaled so the reported values are
// each node's share of the total rank.
//
// g            - graph whose ranks have been calculated.
// callback     - invoked once per node as callback(callback_arg, label, share).
// callback_arg - passed through unchanged as the callback's first argument.
//
// An empty graph reports nothing; returning early also avoids handing qsort a
// possibly-NULL zero-length buffer. Raises a Ruby NoMemoryError if the
// temporary sort buffer cannot be allocated.
void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
  NodeList cell;
  Node *sorted;
  double sum = 0.0;
  unsigned long i;

  if (g->node_count == 0) {
    return;
  }

  sorted = malloc(g->node_count * sizeof(Node));
  if (sorted == NULL) {
    rb_memerror();
  }

  // g->nodes is built by prepending, so filling the array from the back puts
  // the nodes in the order they were added.
  i = g->node_count;
  for (cell = g->nodes; cell != NULL; cell = cell->next) {
    sorted[--i] = cell->node;
    sum += cell->node->rank;
  }

  qsort(sorted, g->node_count, sizeof(Node), node_compare);

  for (i = 0; i < g->node_count; i++) {
    callback(callback_arg, sorted[i]->label, sorted[i]->rank / sum);
  }

  free(sorted);
}
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#ifndef PAGE_RANK_SPARSE_NATIVE_H
#define PAGE_RANK_SPARSE_NATIVE_H

#include <ruby.h>

// Singly-linked list cell holding one node.
struct NodeListStruct;
typedef struct NodeListStruct* NodeList;

typedef struct NodeListStruct {
  struct NodeStruct *node;
  struct NodeListStruct *next;
} NodeListStruct;

//////////////////////////////////////////////////////////////////////////////////////

// Singly-linked list cell holding one edge.
struct EdgeListStruct;
typedef struct EdgeListStruct* EdgeList;

typedef struct EdgeListStruct {
  struct EdgeStruct *edge;
  struct EdgeListStruct *next;
} EdgeListStruct;

//////////////////////////////////////////////////////////////////////////////////////

// A graph node.
struct NodeStruct;
typedef struct NodeStruct* Node;

typedef struct NodeStruct {
  EdgeList source_edges;         // edges arriving at this node
  VALUE label;                   // Ruby object identifying the node
  double prev_rank;              // rank before the latest iteration
  double rank;                   // current rank
  double outbound_weight_total;  // summed weight of edges leaving this node
} NodeStruct;

//////////////////////////////////////////////////////////////////////////////////////

// A weighted edge, stored on its destination node.
struct EdgeStruct;
typedef struct EdgeStruct* Edge;

typedef struct EdgeStruct {
  Node source;    // node the edge starts from
  double weight;
} EdgeStruct;

//////////////////////////////////////////////////////////////////////////////////////

// The whole graph.
struct GraphStruct;
typedef struct GraphStruct* Graph;

typedef struct GraphStruct {
  unsigned long node_count;  // number of cells in `nodes`
  NodeList nodes;            // every node of the graph
  NodeList dangling_nodes;   // nodes without outbound weight
  st_table *node_lookup;     // label -> node
} GraphStruct;

//////////////////////////////////////////////////////////////////////////////////////

// Memory release

void free_graph(void *data);
void free_node(Node n);
void free_node_list(NodeList nodes, void (*free_item)(Node));
void free_edge(Edge e);
void free_edge_list(EdgeList edges, void (*free_item)(Edge));

//////////////////////////////////////////////////////////////////////////////////////

// Graph construction

Node add_node(Graph g, VALUE label);
Node add_dangling_node(Graph g, Node n);
Edge add_edge(Node source, Node destination, double weight);
Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
Node lookup_node(Graph g, VALUE label);

//////////////////////////////////////////////////////////////////////////////////////

// PageRank calculation

void calculate_start(Graph g);
void calculate_step(Graph g, double damping);
double prev_distance(Graph g);
void calculate(Graph g, int max_iterations, double damping, double tolerance);
int node_compare(const void *v1, const void *v2);
void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);

//////////////////////////////////////////////////////////////////////////////////////

// Ruby bindings

// Declared with (void) so the compiler checks that it is called without arguments.
void Init_sparse_native(void);
VALUE sparse_native_allocate(VALUE self);
VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
// NOTE(review): no definition of this function is visible in the accompanying
// source file; it may be a leftover declaration -- confirm before relying on it.
VALUE sorted_and_normalized_ranks(Graph g);
void rb_hash_dset(VALUE hash, VALUE key, double value);

#endif
|
data/lib/page_rank/base.rb
CHANGED
@@ -7,6 +7,8 @@ module PageRank
|
|
7
7
|
##
|
8
8
|
class Base
|
9
9
|
|
10
|
+
attr_reader :damping, :tolerance
|
11
|
+
|
10
12
|
# @param (see #damping=)
|
11
13
|
# @param (see #tolerance=)
|
12
14
|
def initialize(damping: nil, tolerance: nil, **_)
|
@@ -48,7 +50,7 @@ module PageRank
|
|
48
50
|
|
49
51
|
prev_ranks = ranks
|
50
52
|
ranks = calculate_step(ranks)
|
51
|
-
break if distance(ranks, prev_ranks) <
|
53
|
+
break if distance(ranks, prev_ranks) < tolerance
|
52
54
|
|
53
55
|
max_iterations -= 1
|
54
56
|
end
|
@@ -81,7 +83,7 @@ module PageRank
|
|
81
83
|
def distance(vector1, vector2)
|
82
84
|
sum_squares = node_count.times.reduce(0.0) do |sum, i|
|
83
85
|
d = vector1[i] - vector2[i]
|
84
|
-
sum + d * d
|
86
|
+
sum + (d * d)
|
85
87
|
end
|
86
88
|
Math.sqrt(sum_squares)
|
87
89
|
end
|
data/lib/page_rank/dense.rb
CHANGED
@@ -79,7 +79,7 @@ module PageRank
|
|
79
79
|
total = total_out_weights[source_idx]
|
80
80
|
if total
|
81
81
|
w = @out_links[source_idx][dest_idx] || 0.0
|
82
|
-
|
82
|
+
(damping * w / total) + ((1 - damping) / node_count.to_f)
|
83
83
|
else
|
84
84
|
1.0 / node_count.to_f
|
85
85
|
end
|
data/lib/page_rank/sparse.rb
CHANGED
@@ -56,7 +56,7 @@ module PageRank
|
|
56
56
|
w / @weight_totals[source]
|
57
57
|
end
|
58
58
|
end
|
59
|
-
|
59
|
+
@nodes.to_h { |k| [k, 1.0 / node_count.to_f] }
|
60
60
|
end
|
61
61
|
|
62
62
|
def calculate_step(ranks)
|
@@ -68,14 +68,14 @@ module PageRank
|
|
68
68
|
@dangling_nodes.each do |source|
|
69
69
|
sum += ranks[source] / node_count.to_f
|
70
70
|
end
|
71
|
-
new_ranks[dest] =
|
71
|
+
new_ranks[dest] = (damping * sum) + ((1 - damping) / node_count)
|
72
72
|
end
|
73
73
|
end
|
74
74
|
|
75
75
|
def sort_ranks(ranks)
|
76
76
|
sum = 0.0
|
77
77
|
ranks.each { |_, v| sum += v }
|
78
|
-
|
78
|
+
ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }.to_h
|
79
79
|
end
|
80
80
|
|
81
81
|
def distance(vector1, vector2)
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module PageRank
  ##
  # PageRank graph whose edge storage and rank calculation are delegated to the
  # private native methods +_add_edge+ and +_calculate+.
  ##
  class SparseNative < Base

    # require 'page_rank/sparse_native.so'

    # Adds a weighted edge to the graph; an edge from a node to itself is ignored.
    # @param (see Base#add)
    # @param weight [Float] Optional weight for the graph edge
    # @return (see Base#add)
    def add(source, dest, weight: 1.0)
      return if source == dest

      _add_edge(source, dest, weight)
    end

    # Perform the PageRank calculation
    # @param max_iterations [Fixnum] Maximum number of PageRank iterations to perform (or -1 for no max)
    # @return [Hash<Object, Float>] of nodes with rank
    def calculate(max_iterations: -1, **_ignored)
      _calculate(max_iterations, damping, tolerance)
    end

  end
end
|
data/lib/page_rank.rb
CHANGED
@@ -17,16 +17,17 @@ require 'set'
|
|
17
17
|
##
|
18
18
|
module PageRank
|
19
19
|
|
20
|
-
autoload :Base,
|
21
|
-
autoload :Dense,
|
22
|
-
autoload :Sparse,
|
20
|
+
autoload :Base, 'page_rank/base'
|
21
|
+
autoload :Dense, 'page_rank/dense'
|
22
|
+
autoload :Sparse, 'page_rank/sparse'
|
23
|
+
autoload :SparseNative, 'page_rank/sparse_native'
|
23
24
|
|
24
25
|
# @option options [Symbol] :strategy PageRank strategy to use (either :sparse or :dense)
|
25
26
|
# @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
|
26
27
|
# @option options [Float] :tolerance The desired accuracy of the results
|
27
28
|
# @return [PageRank::Base]
|
28
29
|
def self.new(strategy: :sparse, **options)
|
29
|
-
const_get(strategy.to_s.capitalize).new(**options)
|
30
|
+
const_get(strategy.to_s.split('_').map(&:capitalize).join).new(**options)
|
30
31
|
end
|
31
32
|
|
32
33
|
# Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
|
@@ -3,14 +3,12 @@ module TextRank
|
|
3
3
|
##
|
4
4
|
# Character filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
|
5
5
|
#
|
6
|
-
# rubocop:disable Style/AsciiComments
|
7
6
|
#
|
8
7
|
# = Example
|
9
8
|
#
|
10
9
|
# AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
|
11
10
|
# => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
|
12
11
|
#
|
13
|
-
# rubocop:enable Style/AsciiComments
|
14
12
|
#
|
15
13
|
##
|
16
14
|
class AsciiFolding
|
@@ -57,7 +57,7 @@ module TextRank
|
|
57
57
|
end
|
58
58
|
|
59
59
|
# Calculates the "similarity" between this fingerprint and another
|
60
|
-
# @param {Fingerprint} A second fingerprint to compare
|
60
|
+
# @param {Fingerprint} other A second fingerprint to compare
|
61
61
|
# @return [Number] A number between 0.0 (different) and 1.0 (same)
|
62
62
|
def similarity(other)
|
63
63
|
return 1.0 if values == other.values # Short-circuit for efficiency
|
@@ -83,7 +83,7 @@ module TextRank
|
|
83
83
|
|
84
84
|
def norm_factor
|
85
85
|
@norm_factor ||= size.times.reduce(0.0) do |s, i|
|
86
|
-
s + (i + 1) / Math.log(i + 2) / size.to_f
|
86
|
+
s + ((i + 1) / Math.log(i + 2) / size.to_f)
|
87
87
|
end
|
88
88
|
end
|
89
89
|
|
@@ -60,7 +60,7 @@ module TextRank
|
|
60
60
|
# @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
|
61
61
|
# @return [nil]
|
62
62
|
def build_graph(tokens, graph)
|
63
|
-
ngram_window = @ngram_size * 2 + 1
|
63
|
+
ngram_window = (@ngram_size * 2) + 1
|
64
64
|
tokens.size.times do |i|
|
65
65
|
ngram_window.times do |j|
|
66
66
|
consider_ngram_window(tokens, graph, i, j)
|
@@ -71,14 +71,14 @@ module TextRank
|
|
71
71
|
|
72
72
|
private
|
73
73
|
|
74
|
-
def consider_ngram_window(tokens, graph,
|
75
|
-
return if
|
74
|
+
def consider_ngram_window(tokens, graph, idx_i, idx_j)
|
75
|
+
return if idx_j == @ngram_size || idx_i + idx_j < @ngram_size
|
76
76
|
|
77
|
-
token_i = tokens[
|
78
|
-
token_j = tokens[
|
77
|
+
token_i = tokens[idx_i]
|
78
|
+
token_j = tokens[idx_i - @ngram_size + idx_j]
|
79
79
|
|
80
80
|
if token_j
|
81
|
-
graph.add(token_i, token_j, weight: 1.0 / (
|
81
|
+
graph.add(token_i, token_j, weight: 1.0 / (idx_j - @ngram_size).abs)
|
82
82
|
end
|
83
83
|
end
|
84
84
|
|
@@ -71,7 +71,6 @@ module TextRank
|
|
71
71
|
end
|
72
72
|
|
73
73
|
# Sets the graph strategy for producing a graph from tokens
|
74
|
-
# @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
|
75
74
|
# @return [Class, Symbol, #build_graph]
|
76
75
|
attr_writer :graph_strategy
|
77
76
|
|
@@ -103,14 +102,23 @@ module TextRank
|
|
103
102
|
end
|
104
103
|
|
105
104
|
# Filter & tokenize text, and return PageRank
|
106
|
-
# @param text [String] unfiltered text to be processed
|
105
|
+
# @param text [String,Array<String>] unfiltered text to be processed
|
107
106
|
# @return [Hash<String, Float>] tokens and page ranks (in descending order)
|
108
107
|
def extract(text, **options)
|
109
|
-
|
108
|
+
text = Array(text)
|
109
|
+
tokens_per_text = text.map do |t|
|
110
|
+
tokenize(t)
|
111
|
+
end
|
110
112
|
graph = PageRank.new(**@page_rank_options)
|
111
|
-
classify(@graph_strategy, context: GraphStrategy)
|
113
|
+
strategy = classify(@graph_strategy, context: GraphStrategy)
|
114
|
+
tokens_per_text.each do |tokens|
|
115
|
+
strategy.build_graph(tokens, graph)
|
116
|
+
end
|
112
117
|
ranks = graph.calculate(**options)
|
113
|
-
|
118
|
+
tokens_per_text.each_with_index do |tokens, i|
|
119
|
+
ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
|
120
|
+
end
|
121
|
+
ranks
|
114
122
|
end
|
115
123
|
|
116
124
|
private
|
@@ -151,7 +151,7 @@ module TextRank
|
|
151
151
|
# tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
|
152
152
|
# to find what we can.
|
153
153
|
def scan_text_for_all_permutations_of(single_tokens)
|
154
|
-
# NOTE that by reversing the order we craft the regex to prefer larger combinations over
|
154
|
+
# NOTE: that by reversing the order we craft the regex to prefer larger combinations over
|
155
155
|
# smaller combinations (or singletons).
|
156
156
|
perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
|
157
157
|
scan_text_for_n_permutations_of(single_tokens, n)
|
@@ -162,8 +162,8 @@ module TextRank
|
|
162
162
|
end unless perms.empty?
|
163
163
|
end
|
164
164
|
|
165
|
-
def scan_text_for_n_permutations_of(single_tokens,
|
166
|
-
single_tokens.permutation(
|
165
|
+
def scan_text_for_n_permutations_of(single_tokens, n_perms)
|
166
|
+
single_tokens.permutation(n_perms).map do |perm|
|
167
167
|
unless @permutations_scanned.key?(perm)
|
168
168
|
@permutations_scanned[perm] = 0
|
169
169
|
perm
|
@@ -14,7 +14,7 @@ module TextRank
|
|
14
14
|
# @param ranks [Hash<String, Float>] the results of the PageRank algorithm
|
15
15
|
# @return [Hash<String, Float>]
|
16
16
|
def filter!(ranks, **_)
|
17
|
-
|
17
|
+
ranks.sort_by { |_, v| @descending ? -v : v }.to_h
|
18
18
|
end
|
19
19
|
|
20
20
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module TextRank
|
2
2
|
module Tokenizer
|
3
3
|
|
4
|
-
CURRENCY_SYMBOLS =
|
4
|
+
CURRENCY_SYMBOLS = "[#{[
|
5
5
|
"\u00a4", # Generic Currency Symbol
|
6
6
|
"\u0024", # Dollar Sign
|
7
7
|
"\u00a2", # Cent Sign
|
@@ -26,14 +26,13 @@ module TextRank
|
|
26
26
|
"\u20ab", # Dong Sign
|
27
27
|
"\u0025", # Percent
|
28
28
|
"\u2030", # Per Million
|
29
|
-
].join
|
29
|
+
].join}]"
|
30
30
|
private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
|
31
31
|
|
32
32
|
##
|
33
33
|
# A tokenizer regex that preserves money or formatted numbers as a single token. This
|
34
34
|
# currently supports 24 different currency symbols:
|
35
35
|
#
|
36
|
-
# rubocop:disable Style/AsciiComments
|
37
36
|
#
|
38
37
|
# * ¤
|
39
38
|
# * $
|
@@ -60,7 +59,6 @@ module TextRank
|
|
60
59
|
# * %
|
61
60
|
# * ‰
|
62
61
|
|
63
|
-
# rubocop:enable Style/AsciiComments
|
64
62
|
#
|
65
63
|
# It also supports two alternative formats for negatives as well as optional three digit comma
|
66
64
|
# separation and optional decimals.
|