text_rank 1.2.3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1daece496ee8bb075ecc3540d6b8ca64d45c740b1ca34529e094283933d9c97
4
- data.tar.gz: bee8ff77c21cffa95d838e91ae09773b290afd4b9c0224415c2d783f63069b1b
3
+ metadata.gz: 5b25779e7c013e9d0c1d867324f58d40a062bacae0e38f92714e1d3fd7b0e7ef
4
+ data.tar.gz: 34c36b8ff6673092b2463b9f4e0fdaf94a55e50c3e52e4aeec125775c7fa3a9d
5
5
  SHA512:
6
- metadata.gz: 0e9df6c07d6c8bb94a782b61c7877e7fe2e4dd064645c87a0e636bae2236ffbf92e848426d5d8cb4daa04328a6afc25c0a58de3c14ba316c84bb07caa45801f8
7
- data.tar.gz: e7a10407dce5651a05aa208db1136a3c0cc1082c70152b676d3867408cc8107dc8543080e20b8855659b8b58c4fe0063321fc5e8b976933372ff02d6b597ca82
6
+ metadata.gz: 6f03e71745ed96077c63ed376303fcfaa8683f960319d71a405b943aa4a23383938c914b33c867f76f4d979505aeb1d5a0110b51dcc1eadab14cab41d6ee8697
7
+ data.tar.gz: edb17a0ee101254a5afc7c7ee5b084e11ce1d8bfd5083d5069a5e0841751a1f8f175fa167b788186b881e72a1114fa17a8cddc8c3328e00dda812a89b8647196
data/.codeclimate.yml CHANGED
@@ -16,7 +16,7 @@ engines:
16
16
  - 4218049e28199ed950d3cd721df86dce
17
17
  - c8179d0de3a9df18a2c45750d3f8647e
18
18
  - 03f6eee11d86507da564695007106721
19
- channel: rubocop-0-85
19
+ channel: rubocop-1-23-0
20
20
  ratings:
21
21
  paths:
22
22
  - "**.rb"
data/.gitignore CHANGED
@@ -8,3 +8,7 @@
8
8
  /pkg/
9
9
  /spec/reports/
10
10
  /tmp/
11
+
12
+
13
+ *.bundle
14
+ *.so
data/.rubocop.yml CHANGED
@@ -15,6 +15,10 @@ Layout/EmptyLinesAroundModuleBody:
15
15
  Layout/ExtraSpacing:
16
16
  Enabled: false
17
17
 
18
+ Layout/HashAlignment:
19
+ EnforcedHashRocketStyle: table
20
+ EnforcedColonStyle: table
21
+
18
22
  Layout/LineLength:
19
23
  Max: 120
20
24
  Enabled: false
@@ -89,6 +93,9 @@ Style/GuardClause:
89
93
  Style/HashEachMethods:
90
94
  Enabled: true
91
95
 
96
+ Style/HashSyntax:
97
+ Enabled: true
98
+
92
99
  Style/HashTransformKeys:
93
100
  Enabled: true
94
101
 
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- ruby-2.5.1
1
+ ruby-3.0.3
data/.travis.yml CHANGED
@@ -9,6 +9,7 @@ before_script:
9
9
  - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
10
10
  - chmod +x ./cc-test-reporter
11
11
  - ./cc-test-reporter before-build
12
+ - bundle exec rake compile
12
13
  script:
13
14
  - bundle exec rspec
14
15
  after_script:
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rake/extensiontask"
2
3
  require "rspec/core/rake_task"
3
4
 
4
5
  RSpec::Core::RakeTask.new(:spec)
@@ -10,3 +11,7 @@ RDoc::Task.new do |rdoc|
10
11
  rdoc.main = "README.md"
11
12
  rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
12
13
  end
14
+
15
+ Rake::ExtensionTask.new('text_rank') do |ext|
16
+ ext.lib_dir = 'lib/text_rank'
17
+ end
data/bin/console CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "text_rank"
3
+ require 'bundler/setup'
4
+ require 'text_rank'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "text_rank"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
13
+ require 'irb'
14
14
  IRB.start
data/ext/text_rank/extconf.rb ADDED
@@ -0,0 +1,3 @@
1
+ require "mkmf"
2
+
3
+ create_makefile('text_rank/text_rank')
data/ext/text_rank/page_rank_sparse_native.c ADDED
@@ -0,0 +1,300 @@
1
+ #include <ruby.h>
2
+ #include <math.h>
3
+ #include <page_rank_sparse_native.h>
4
+
5
+ const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
6
+ const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
7
+ const size_t NODE_SIZE = sizeof(NodeStruct);
8
+ const size_t EDGE_SIZE = sizeof(EdgeStruct);
9
+ const size_t GRAPH_SIZE = sizeof(GraphStruct);
10
+
11
+ static const rb_data_type_t graph_typed_data = {
12
+ "PageRank/SparseNative/Graph",
13
+ { 0, free_graph, },
14
+ 0, 0,
15
+ RUBY_TYPED_FREE_IMMEDIATELY,
16
+ };
17
+
18
+
19
+ //////////////////////////////////////////////////////////////////////////////////////
20
+
21
+ void Init_sparse_native() {
22
+ VALUE PageRankModule, SparseNativeClass;
23
+
24
+ PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
25
+ SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
26
+
27
+ rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
28
+ rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
29
+ rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
30
+ }
31
+
32
+ VALUE sparse_native_allocate(VALUE self) {
33
+ Graph g = malloc(GRAPH_SIZE);
34
+
35
+ // Grab a reference to the hash type used by a generic Ruby {}
36
+ // which accepts any key and any value. We'll need this type to create
37
+ // a st_table in which to put arbitrary VALUE keys. This hash type
38
+ // should be a static constant and thus should be safe to utilize without
39
+ // fear of garbage collection.
40
+ const struct st_hash_type *objhash = rb_hash_tbl(rb_hash_new(), "page_rank_sparse_native.c", 40)->type;
41
+
42
+ g->node_count = 0;
43
+ g->nodes = NULL;
44
+ g->dangling_nodes = NULL;
45
+ g->node_lookup = st_init_table_with_size(objhash, 0);
46
+
47
+ return TypedData_Wrap_Struct(self, &graph_typed_data, g);
48
+ }
49
+
50
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
51
+ Graph g;
52
+
53
+ TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
54
+ add_edge_with_labels(g, source, dest, NUM2DBL(weight));
55
+ return Qnil;
56
+ }
57
+
58
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
59
+ Graph g;
60
+ VALUE ranks;
61
+
62
+ TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
63
+ calculate(g, FIX2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));
64
+
65
+ ranks = rb_hash_new();
66
+ sort_and_normalize_ranks(g, rb_hash_dset, ranks);
67
+ return ranks;
68
+ }
69
+
70
+ void rb_hash_dset(VALUE hash, VALUE key, double value) {
71
+ rb_hash_aset(hash, key, DBL2NUM(value));
72
+ }
73
+
74
+ //////////////////////////////////////////////////////////////////////////////////////
75
+
76
+ void free_graph(void *data) {
77
+ Graph g = (Graph)data;
78
+ free_node_list(g->nodes, free_node);
79
+ free_node_list(g->dangling_nodes, NULL);
80
+ free(g->node_lookup);
81
+ free(g);
82
+ }
83
+
84
+ void free_node(Node n) {
85
+ free_edge_list(n->source_edges, free_edge);
86
+ free(n);
87
+ }
88
+
89
+ void free_node_list(NodeList nodes, void (*free_item)(Node)) {
90
+ NodeList tmp;
91
+ while (nodes != NULL) {
92
+ tmp = nodes;
93
+ nodes = nodes->next;
94
+ if (free_item) {
95
+ free_item(tmp->node);
96
+ }
97
+ free(tmp);
98
+ }
99
+ }
100
+
101
+ void free_edge(Edge e) {
102
+ // Assume source node was allocated elsewhere and will be free'd elsewhere
103
+ free(e);
104
+ }
105
+
106
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
107
+ EdgeList tmp;
108
+ while (edges != NULL) {
109
+ tmp = edges;
110
+ edges = edges->next;
111
+ if (free_item) {
112
+ free_item(tmp->edge);
113
+ }
114
+ free(tmp);
115
+ }
116
+ }
117
+
118
+ //////////////////////////////////////////////////////////////////////////////////////
119
+
120
+ Node add_node(Graph g, VALUE label) {
121
+ NodeList tmp = malloc(NODE_LIST_SIZE);
122
+
123
+ tmp->node = malloc(NODE_SIZE);
124
+ tmp->node->label = label;
125
+ tmp->node->source_edges = NULL;
126
+ tmp->node->rank = 0.0;
127
+ tmp->node->prev_rank = 0.0;
128
+ tmp->node->outbound_weight_total = 0.0;
129
+
130
+ tmp->next = g->nodes;
131
+ g->nodes = tmp;
132
+ g->node_count += 1;
133
+
134
+ return tmp->node;
135
+ }
136
+
137
+ Node add_dangling_node(Graph g, Node n) {
138
+ NodeList tmp = malloc(NODE_LIST_SIZE);
139
+
140
+ tmp->node = n;
141
+ tmp->next = g->dangling_nodes;
142
+ g->dangling_nodes = tmp;
143
+
144
+ return n;
145
+ }
146
+
147
+ Edge add_edge(Node source, Node destination, double weight) {
148
+ EdgeList tmp = malloc(EDGE_LIST_SIZE);
149
+
150
+ tmp->edge = malloc(EDGE_SIZE);
151
+ tmp->edge->source = source;
152
+ tmp->edge->weight = weight;
153
+
154
+ tmp->next = destination->source_edges;
155
+ destination->source_edges = tmp;
156
+ source->outbound_weight_total += weight;
157
+
158
+ return tmp->edge;
159
+ }
160
+
161
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
162
+ Node source, dest;
163
+
164
+ source = lookup_node(g, source_label);
165
+ dest = lookup_node(g, dest_label);
166
+
167
+ return add_edge(source, dest, weight);
168
+ }
169
+
170
+ Node lookup_node(Graph g, VALUE label) {
171
+ Node n;
172
+
173
+ if (!st_lookup(g->node_lookup, (st_data_t)label, (st_data_t *)&n)) {
174
+ n = add_node(g, label);
175
+ st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
176
+ }
177
+ return n;
178
+ }
179
+
180
+ //////////////////////////////////////////////////////////////////////////////////////
181
+
182
+ void calculate_start(Graph g) {
183
+ NodeList nodes;
184
+ Node source, destination;
185
+ EdgeList edges;
186
+ Edge e;
187
+
188
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
189
+ destination = nodes->node;
190
+
191
+ // If there is no outbound weight, this is a "dangling" node
192
+ if (destination->outbound_weight_total == 0.0) {
193
+ add_dangling_node(g, destination);
194
+ }
195
+
196
+ // Normalize all source edge weights
197
+ for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
198
+ e = edges->edge;
199
+ source = e->source;
200
+ e->weight = e->weight / source->outbound_weight_total;
201
+ }
202
+
203
+ // Set the initial rank
204
+ destination->prev_rank = 0;
205
+ destination->rank = 1.0 / g->node_count;
206
+ }
207
+ }
208
+
209
+ void calculate_step(Graph g, double damping) {
210
+ NodeList nodes, dangling_nodes;
211
+ Node source, destination;
212
+ EdgeList edges;
213
+ Edge e;
214
+ double sum;
215
+
216
+ // Set prev rank to rank for all nodes
217
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
218
+ destination = nodes->node;
219
+ destination->prev_rank = destination->rank;
220
+ }
221
+
222
+ // Redistribute the rankings according to weight
223
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
224
+ destination = nodes->node;
225
+ sum = 0.0;
226
+ for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
227
+ e = edges->edge;
228
+ source = e->source;
229
+ sum += source->prev_rank * e->weight;
230
+ }
231
+ for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
232
+ source = dangling_nodes->node;
233
+ sum += source->prev_rank / g->node_count;
234
+ }
235
+ destination->rank = damping * sum + (1 - damping) / g->node_count;
236
+ }
237
+ }
238
+
239
+ // Calculate the Euclidean distance from prev_rank to rank across all nodes
240
+ double prev_distance(Graph g) {
241
+ NodeList nodes;
242
+ Node n;
243
+ double rank_diff, sum_squares = 0.0;
244
+
245
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
246
+ n = nodes->node;
247
+ rank_diff = n->prev_rank - n->rank;
248
+ sum_squares += rank_diff * rank_diff;
249
+ }
250
+
251
+ return sqrt(sum_squares);
252
+ }
253
+
254
+ void calculate(Graph g, int max_iterations, double damping, double tolerance) {
255
+ calculate_start(g);
256
+
257
+ while (max_iterations != 0) { // If negative one, allow to go without limit
258
+ calculate_step(g, damping);
259
+ if (prev_distance(g) < tolerance) {
260
+ break;
261
+ }
262
+ max_iterations--;
263
+ }
264
+ }
265
+
266
+ int node_compare(const void *v1, const void *v2) {
267
+ double rank1, rank2, cmp;
268
+
269
+ rank1 = (*(Node *)v1)->rank;
270
+ rank2 = (*(Node *)v2)->rank;
271
+ cmp = rank2 - rank1; // Decreasing order
272
+ if (cmp < 0) return -1;
273
+ if (cmp > 0) return 1;
274
+ return 0;
275
+ }
276
+
277
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
278
+ NodeList nodes;
279
+ Node n;
280
+ double sum = 0.0;
281
+ unsigned long i;
282
+ Node *tmp;
283
+
284
+ i = g->node_count;
285
+ tmp = malloc(g->node_count * sizeof(Node));
286
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
287
+ n = nodes->node;
288
+ tmp[--i] = n;
289
+ sum += n->rank;
290
+ }
291
+
292
+ qsort(tmp, g->node_count, sizeof(Node), node_compare);
293
+
294
+ for (i = 0; i < g->node_count; i++) {
295
+ n = tmp[i];
296
+ callback(callback_arg, n->label, n->rank / sum);
297
+ }
298
+
299
+ free(tmp);
300
+ }
data/ext/text_rank/page_rank_sparse_native.h ADDED
@@ -0,0 +1,93 @@
1
+ #ifndef PAGE_RANK_SPARSE_NATIVE_H
2
+ #define PAGE_RANK_SPARSE_NATIVE_H
3
+
4
+ #include <ruby.h>
5
+
6
+ struct NodeListStruct;
7
+ typedef struct NodeListStruct* NodeList;
8
+
9
+ typedef struct NodeListStruct {
10
+ struct NodeStruct *node;
11
+ struct NodeListStruct *next;
12
+ } NodeListStruct;
13
+
14
+ //////////////////////////////////////////////////////////////////////////////////////
15
+
16
+ struct EdgeListStruct;
17
+ typedef struct EdgeListStruct* EdgeList;
18
+
19
+ typedef struct EdgeListStruct {
20
+ struct EdgeStruct *edge;
21
+ struct EdgeListStruct *next;
22
+ } EdgeListStruct;
23
+
24
+ //////////////////////////////////////////////////////////////////////////////////////
25
+
26
+ struct NodeStruct;
27
+ typedef struct NodeStruct* Node;
28
+
29
+ typedef struct NodeStruct {
30
+ EdgeList source_edges;
31
+ VALUE label;
32
+ double prev_rank;
33
+ double rank;
34
+ double outbound_weight_total;
35
+ } NodeStruct;
36
+
37
+ //////////////////////////////////////////////////////////////////////////////////////
38
+
39
+ struct EdgeStruct;
40
+ typedef struct EdgeStruct* Edge;
41
+
42
+ typedef struct EdgeStruct {
43
+ Node source;
44
+ double weight;
45
+ } EdgeStruct;
46
+
47
+ //////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ struct GraphStruct;
50
+ typedef struct GraphStruct* Graph;
51
+
52
+ typedef struct GraphStruct {
53
+ unsigned long node_count;
54
+ NodeList nodes;
55
+ NodeList dangling_nodes;
56
+ st_table *node_lookup;
57
+ } GraphStruct;
58
+
59
+ //////////////////////////////////////////////////////////////////////////////////////
60
+
61
+ void free_graph(void *data);
62
+ void free_node(Node n);
63
+ void free_node_list(NodeList nodes, void (*free_item)(Node));
64
+ void free_edge(Edge e);
65
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge));
66
+
67
+ //////////////////////////////////////////////////////////////////////////////////////
68
+
69
+ Node add_node(Graph g, VALUE label);
70
+ Node add_dangling_node(Graph g, Node n);
71
+ Edge add_edge(Node source, Node destination, double weight);
72
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
73
+ Node lookup_node(Graph g, VALUE label);
74
+
75
+ //////////////////////////////////////////////////////////////////////////////////////
76
+
77
+ void calculate_start(Graph g);
78
+ void calculate_step(Graph g, double damping);
79
+ double prev_distance(Graph g);
80
+ void calculate(Graph g, int max_iterations, double damping, double tolerance);
81
+ int node_compare(const void *v1, const void *v2);
82
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);
83
+
84
+ //////////////////////////////////////////////////////////////////////////////////////
85
+
86
+ void Init_sparse_native();
87
+ VALUE sparse_native_allocate(VALUE self);
88
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
89
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
90
+ VALUE sorted_and_normalized_ranks(Graph g);
91
+ void rb_hash_dset(VALUE hash, VALUE key, double value);
92
+
93
+ #endif
data/ext/text_rank/text_rank.c ADDED
@@ -0,0 +1,5 @@
1
+ #include <page_rank_sparse_native.h>
2
+
3
+ void Init_text_rank() {
4
+ Init_sparse_native();
5
+ }
data/lib/page_rank/base.rb CHANGED
@@ -7,6 +7,8 @@ module PageRank
7
7
  ##
8
8
  class Base
9
9
 
10
+ attr_reader :damping, :tolerance
11
+
10
12
  # @param (see #damping=)
11
13
  # @param (see #tolerance=)
12
14
  def initialize(damping: nil, tolerance: nil, **_)
@@ -19,8 +21,7 @@ module PageRank
19
21
  # @return [Float]
20
22
  def damping=(damping)
21
23
  @damping = damping || 0.85
22
- raise ArgumentError.new('Invalid damping factor') if @damping <= 0 || @damping > 1
23
- @damping
24
+ raise ArgumentError, 'Invalid damping factor' if @damping <= 0 || @damping > 1
24
25
  end
25
26
 
26
27
  # Set the tolerance value
@@ -28,8 +29,7 @@ module PageRank
28
29
  # @return [Float]
29
30
  def tolerance=(tolerance)
30
31
  @tolerance = tolerance || 0.0001
31
- raise ArgumentError.new('Invalid tolerance factor') if @tolerance < 0 || @tolerance > 1
32
- @tolerance
32
+ raise ArgumentError, 'Invalid tolerance factor' if @tolerance.negative? || @tolerance > 1
33
33
  end
34
34
 
35
35
  # Adds a directed (and optionally weighted) edge to the graph
@@ -46,9 +46,12 @@ module PageRank
46
46
  def calculate(max_iterations: -1, **_)
47
47
  ranks = initial_ranks
48
48
  loop do
49
- break if max_iterations == 0
50
- ranks, prev_ranks = calculate_step(ranks), ranks
51
- break if distance(ranks, prev_ranks) < @tolerance
49
+ break if max_iterations.zero?
50
+
51
+ prev_ranks = ranks
52
+ ranks = calculate_step(ranks)
53
+ break if distance(ranks, prev_ranks) < tolerance
54
+
52
55
  max_iterations -= 1
53
56
  end
54
57
  sort_ranks(ranks)
@@ -77,9 +80,9 @@ module PageRank
77
80
  end
78
81
 
79
82
  # Calculate the Euclidean distance from one ranking to the next iteration
80
- def distance(v1, v2)
83
+ def distance(vector1, vector2)
81
84
  sum_squares = node_count.times.reduce(0.0) do |sum, i|
82
- d = v1[i] - v2[i]
85
+ d = vector1[i] - vector2[i]
83
86
  sum + d * d
84
87
  end
85
88
  Math.sqrt(sum_squares)
data/lib/page_rank/dense.rb CHANGED
@@ -32,6 +32,7 @@ module PageRank
32
32
  # @return (see Base#add)
33
33
  def add(source, dest, weight: 1.0)
34
34
  return if source == dest
35
+
35
36
  source_idx = index(source)
36
37
  dest_idx = index(dest)
37
38
  @out_links[source_idx] ||= []
@@ -72,13 +73,13 @@ module PageRank
72
73
 
73
74
  def to_matrix
74
75
  total_out_weights = @out_links.map do |links|
75
- links.compact.reduce(:+) if links
76
+ links&.compact&.reduce(:+)
76
77
  end
77
78
  Matrix.build(node_count, node_count) do |dest_idx, source_idx|
78
79
  total = total_out_weights[source_idx]
79
80
  if total
80
81
  w = @out_links[source_idx][dest_idx] || 0.0
81
- @damping * w / total + (1 - @damping) / node_count.to_f
82
+ damping * w / total + (1 - damping) / node_count.to_f
82
83
  else
83
84
  1.0 / node_count.to_f
84
85
  end
data/lib/page_rank/sparse.rb CHANGED
@@ -1,5 +1,3 @@
1
- require 'set'
2
-
3
1
  module PageRank
4
2
  ##
5
3
  # Implementation of PageRank using a sparse matrix representation of the graph
@@ -33,6 +31,7 @@ module PageRank
33
31
  # @return (see Base#add)
34
32
  def add(source, dest, weight: 1.0)
35
33
  return false if source == dest
34
+
36
35
  @graph[dest] ||= Set.new
37
36
  @graph[dest] << source
38
37
  @weights[source] ||= Hash.new(0.0)
@@ -53,8 +52,8 @@ module PageRank
53
52
  def initial_ranks
54
53
  @dangling_nodes = @nodes - @weight_totals.keys
55
54
  @normalized_weights = @weights.each_with_object({}) do |(source, values), h|
56
- h[source] = values.each_with_object({}) do |(dest, w), h2|
57
- h2[dest] = w / @weight_totals[source]
55
+ h[source] = values.transform_values do |w|
56
+ w / @weight_totals[source]
58
57
  end
59
58
  end
60
59
  Hash[@nodes.map { |k| [k, 1.0 / node_count.to_f] }]
@@ -69,7 +68,7 @@ module PageRank
69
68
  @dangling_nodes.each do |source|
70
69
  sum += ranks[source] / node_count.to_f
71
70
  end
72
- new_ranks[dest] = @damping * sum + (1 - @damping)/node_count
71
+ new_ranks[dest] = damping * sum + (1 - damping) / node_count
73
72
  end
74
73
  end
75
74
 
@@ -79,8 +78,8 @@ module PageRank
79
78
  Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
80
79
  end
81
80
 
82
- def distance(v1, v2)
83
- super(v1.values.to_a, v2.values.to_a)
81
+ def distance(vector1, vector2)
82
+ super(vector1.values.to_a, vector2.values.to_a)
84
83
  end
85
84
 
86
85
  end
data/lib/page_rank/sparse_native.rb ADDED
@@ -0,0 +1,21 @@
1
+ module PageRank
2
+ class SparseNative < Base
3
+
4
+ #require 'page_rank/sparse_native.so'
5
+
6
+ # @param (see Base#add)
7
+ # @param weight [Float] Optional weight for the graph edge
8
+ # @return (see Base#add)
9
+ def add(source, dest, weight: 1.0)
10
+ _add_edge(source, dest, weight) unless source == dest
11
+ end
12
+
13
+ # Perform the PageRank calculation
14
+ # @param max_iterations [Fixnum] Maximum number of PageRank iterations to perform (or -1 for no max)
15
+ # @return [Hash<Object, Float>] of nodes with rank
16
+ def calculate(max_iterations: -1, **_)
17
+ _calculate(max_iterations, damping, tolerance)
18
+ end
19
+
20
+ end
21
+ end
data/lib/page_rank.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'set'
2
+
1
3
  ##
2
4
  # A module for supporting Ruby implementations of PageRank. Rather than rely on
3
5
  # one single implementation, this module allows for multiple implementations that
@@ -15,16 +17,17 @@
15
17
  ##
16
18
  module PageRank
17
19
 
18
- autoload :Base, 'page_rank/base'
19
- autoload :Dense, 'page_rank/dense'
20
- autoload :Sparse, 'page_rank/sparse'
20
+ autoload :Base, 'page_rank/base'
21
+ autoload :Dense, 'page_rank/dense'
22
+ autoload :Sparse, 'page_rank/sparse'
23
+ autoload :SparseNative, 'page_rank/sparse_native'
21
24
 
22
25
  # @option options [Symbol] :strategy PageRank strategy to use (:sparse, :dense, or :sparse_native)
23
26
  # @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
24
27
  # @option options [Float] :tolerance The desired accuracy of the results
25
28
  # @return [PageRank::Base]
26
29
  def self.new(strategy: :sparse, **options)
27
- const_get(strategy.to_s.capitalize).new(**options)
30
+ const_get(strategy.to_s.split('_').map(&:capitalize).join).new(**options)
28
31
  end
29
32
 
30
33
  # Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.