text_rank 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: e1daece496ee8bb075ecc3540d6b8ca64d45c740b1ca34529e094283933d9c97
- data.tar.gz: bee8ff77c21cffa95d838e91ae09773b290afd4b9c0224415c2d783f63069b1b
+ metadata.gz: 5b25779e7c013e9d0c1d867324f58d40a062bacae0e38f92714e1d3fd7b0e7ef
+ data.tar.gz: 34c36b8ff6673092b2463b9f4e0fdaf94a55e50c3e52e4aeec125775c7fa3a9d
  SHA512:
- metadata.gz: 0e9df6c07d6c8bb94a782b61c7877e7fe2e4dd064645c87a0e636bae2236ffbf92e848426d5d8cb4daa04328a6afc25c0a58de3c14ba316c84bb07caa45801f8
- data.tar.gz: e7a10407dce5651a05aa208db1136a3c0cc1082c70152b676d3867408cc8107dc8543080e20b8855659b8b58c4fe0063321fc5e8b976933372ff02d6b597ca82
+ metadata.gz: 6f03e71745ed96077c63ed376303fcfaa8683f960319d71a405b943aa4a23383938c914b33c867f76f4d979505aeb1d5a0110b51dcc1eadab14cab41d6ee8697
+ data.tar.gz: edb17a0ee101254a5afc7c7ee5b084e11ce1d8bfd5083d5069a5e0841751a1f8f175fa167b788186b881e72a1114fa17a8cddc8c3328e00dda812a89b8647196
data/.codeclimate.yml CHANGED
@@ -16,7 +16,7 @@ engines:
  - 4218049e28199ed950d3cd721df86dce
  - c8179d0de3a9df18a2c45750d3f8647e
  - 03f6eee11d86507da564695007106721
- channel: rubocop-0-85
+ channel: rubocop-1-23-0
  ratings:
  paths:
  - "**.rb"
data/.gitignore CHANGED
@@ -8,3 +8,7 @@
  /pkg/
  /spec/reports/
  /tmp/
+
+
+ *.bundle
+ *.so
data/.rubocop.yml CHANGED
@@ -15,6 +15,10 @@ Layout/EmptyLinesAroundModuleBody:
  Layout/ExtraSpacing:
  Enabled: false

+ Layout/HashAlignment:
+ EnforcedHashRocketStyle: table
+ EnforcedColonStyle: table
+
  Layout/LineLength:
  Max: 120
  Enabled: false
@@ -89,6 +93,9 @@ Style/GuardClause:
  Style/HashEachMethods:
  Enabled: true

+ Style/HashSyntax:
+ Enabled: true
+
  Style/HashTransformKeys:
  Enabled: true

data/.ruby-version CHANGED
@@ -1 +1 @@
- ruby-2.5.1
+ ruby-3.0.3
data/.travis.yml CHANGED
@@ -9,6 +9,7 @@ before_script:
  - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
  - chmod +x ./cc-test-reporter
  - ./cc-test-reporter before-build
+ - bundle exec rake compile
  script:
  - bundle exec rspec
  after_script:
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
  require "bundler/gem_tasks"
+ require "rake/extensiontask"
  require "rspec/core/rake_task"

  RSpec::Core::RakeTask.new(:spec)
@@ -10,3 +11,7 @@ RDoc::Task.new do |rdoc|
  rdoc.main = "README.md"
  rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
  end
+
+ Rake::ExtensionTask.new('text_rank') do |ext|
+ ext.lib_dir = 'lib/text_rank'
+ end
data/bin/console CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env ruby

- require "bundler/setup"
- require "text_rank"
+ require 'bundler/setup'
+ require 'text_rank'

  # You can add fixtures and/or initialization code here to make experimenting
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "text_rank"
  # require "pry"
  # Pry.start

- require "irb"
+ require 'irb'
  IRB.start
data/ext/text_rank/extconf.rb ADDED
@@ -0,0 +1,3 @@
+ require "mkmf"
+
+ create_makefile('text_rank/text_rank')
data/ext/text_rank/page_rank_sparse_native.c ADDED
@@ -0,0 +1,300 @@
+ #include <ruby.h>
+ #include <math.h>
+ #include <page_rank_sparse_native.h>
+
+ const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
+ const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
+ const size_t NODE_SIZE = sizeof(NodeStruct);
+ const size_t EDGE_SIZE = sizeof(EdgeStruct);
+ const size_t GRAPH_SIZE = sizeof(GraphStruct);
+
+ static const rb_data_type_t graph_typed_data = {
+   "PageRank/SparseNative/Graph",
+   { 0, free_graph, },
+   0, 0,
+   RUBY_TYPED_FREE_IMMEDIATELY,
+ };
+
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void Init_sparse_native() {
+   VALUE PageRankModule, SparseNativeClass;
+
+   PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
+   SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
+
+   rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
+   rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
+   rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
+ }
+
+ VALUE sparse_native_allocate(VALUE self) {
+   Graph g = malloc(GRAPH_SIZE);
+
+   // Grab a reference to the hash type used by a generic Ruby {}
+   // which accepts any key and any value. We'll need this type to create
+   // a st_table in which to put arbitrary VALUE keys. This hash type
+   // should be a static constant and thus should be safe to utilize without
+   // fear of garbage collection.
+   const struct st_hash_type *objhash = rb_hash_tbl(rb_hash_new(), "page_rank_sparse_native.c", 40)->type;
+
+   g->node_count = 0;
+   g->nodes = NULL;
+   g->dangling_nodes = NULL;
+   g->node_lookup = st_init_table_with_size(objhash, 0);
+
+   return TypedData_Wrap_Struct(self, &graph_typed_data, g);
+ }
+
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
+   Graph g;
+
+   TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
+   add_edge_with_labels(g, source, dest, NUM2DBL(weight));
+   return Qnil;
+ }
+
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
+   Graph g;
+   VALUE ranks;
+
+   TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
+   calculate(g, FIX2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));
+
+   ranks = rb_hash_new();
+   sort_and_normalize_ranks(g, rb_hash_dset, ranks);
+   return ranks;
+ }
+
+ void rb_hash_dset(VALUE hash, VALUE key, double value) {
+   rb_hash_aset(hash, key, DBL2NUM(value));
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void free_graph(void *data) {
+   Graph g = (Graph)data;
+   free_node_list(g->nodes, free_node);
+   free_node_list(g->dangling_nodes, NULL);
+   free(g->node_lookup);
+   free(g);
+ }
+
+ void free_node(Node n) {
+   free_edge_list(n->source_edges, free_edge);
+   free(n);
+ }
+
+ void free_node_list(NodeList nodes, void (*free_item)(Node)) {
+   NodeList tmp;
+   while (nodes != NULL) {
+     tmp = nodes;
+     nodes = nodes->next;
+     if (free_item) {
+       free_item(tmp->node);
+     }
+     free(tmp);
+   }
+ }
+
+ void free_edge(Edge e) {
+   // Assume source node was allocated elsewhere and will be free'd elsewhere
+   free(e);
+ }
+
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
+   EdgeList tmp;
+   while (edges != NULL) {
+     tmp = edges;
+     edges = edges->next;
+     if (free_item) {
+       free_item(tmp->edge);
+     }
+     free(tmp);
+   }
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ Node add_node(Graph g, VALUE label) {
+   NodeList tmp = malloc(NODE_LIST_SIZE);
+
+   tmp->node = malloc(NODE_SIZE);
+   tmp->node->label = label;
+   tmp->node->source_edges = NULL;
+   tmp->node->rank = 0.0;
+   tmp->node->prev_rank = 0.0;
+   tmp->node->outbound_weight_total = 0.0;
+
+   tmp->next = g->nodes;
+   g->nodes = tmp;
+   g->node_count += 1;
+
+   return tmp->node;
+ }
+
+ Node add_dangling_node(Graph g, Node n) {
+   NodeList tmp = malloc(NODE_LIST_SIZE);
+
+   tmp->node = n;
+   tmp->next = g->dangling_nodes;
+   g->dangling_nodes = tmp;
+
+   return n;
+ }
+
+ Edge add_edge(Node source, Node destination, double weight) {
+   EdgeList tmp = malloc(EDGE_LIST_SIZE);
+
+   tmp->edge = malloc(EDGE_SIZE);
+   tmp->edge->source = source;
+   tmp->edge->weight = weight;
+
+   tmp->next = destination->source_edges;
+   destination->source_edges = tmp;
+   source->outbound_weight_total += weight;
+
+   return tmp->edge;
+ }
+
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
+   Node source, dest;
+
+   source = lookup_node(g, source_label);
+   dest = lookup_node(g, dest_label);
+
+   return add_edge(source, dest, weight);
+ }
+
+ Node lookup_node(Graph g, VALUE label) {
+   Node n;
+
+   if (!st_lookup(g->node_lookup, (st_data_t)label, (st_data_t *)&n)) {
+     n = add_node(g, label);
+     st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
+   }
+   return n;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void calculate_start(Graph g) {
+   NodeList nodes;
+   Node source, destination;
+   EdgeList edges;
+   Edge e;
+
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     destination = nodes->node;
+
+     // If there is no outband, this is a "dangling" node
+     if (destination->outbound_weight_total == 0.0) {
+       add_dangling_node(g, destination);
+     }
+
+     // Normalize all source edge weights
+     for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
+       e = edges->edge;
+       source = e->source;
+       e->weight = e->weight / source->outbound_weight_total;
+     }
+
+     // Set the initial rank
+     destination->prev_rank = 0;
+     destination->rank = 1.0 / g->node_count;
+   }
+ }
+
+ void calculate_step(Graph g, double damping) {
+   NodeList nodes, dangling_nodes;
+   Node source, destination;
+   EdgeList edges;
+   Edge e;
+   double sum;
+
+   // Set prev rank to rank for all nodes
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     destination = nodes->node;
+     destination->prev_rank = destination->rank;
+   }
+
+   // Re-destribute the rankings according to weight
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     destination = nodes->node;
+     sum = 0.0;
+     for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
+       e = edges->edge;
+       source = e->source;
+       sum += source->prev_rank * e->weight;
+     }
+     for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
+       source = dangling_nodes->node;
+       sum += source->prev_rank / g->node_count;
+     }
+     destination->rank = damping * sum + (1 - damping) / g->node_count;
+   }
+ }
+
+ // Calculate the Euclidean distance from prev_rank to rank across all nodes
+ double prev_distance(Graph g) {
+   NodeList nodes;
+   Node n;
+   double rank_diff, sum_squares = 0.0;
+
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     n = nodes->node;
+     rank_diff = n->prev_rank - n->rank;
+     sum_squares += rank_diff * rank_diff;
+   }
+
+   return sqrt(sum_squares);
+ }
+
+ void calculate(Graph g, int max_iterations, double damping, double tolerance) {
+   calculate_start(g);
+
+   while (max_iterations != 0) { // If negative one, allow to go without limit
+     calculate_step(g, damping);
+     if (prev_distance(g) < tolerance) {
+       break;
+     }
+     max_iterations--;
+   }
+ }
+
+ int node_compare(const void *v1, const void *v2) {
+   double rank1, rank2, cmp;
+
+   rank1 = (*(Node *)v1)->rank;
+   rank2 = (*(Node *)v2)->rank;
+   cmp = rank2 - rank1; // Decreasing order
+   if (cmp < 0) return -1;
+   if (cmp > 0) return 1;
+   return 0;
+ }
+
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
+   NodeList nodes;
+   Node n;
+   double sum = 0.0;
+   unsigned long i;
+   Node *tmp;
+
+   i = g->node_count;
+   tmp = malloc(g->node_count * sizeof(Node));
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     n = nodes->node;
+     tmp[--i] = n;
+     sum += n->rank;
+   }
+
+   qsort(tmp, g->node_count, sizeof(Node), node_compare);
+
+   for (i = 0; i < g->node_count; i++) {
+     n = tmp[i];
+     callback(callback_arg, n->label, n->rank / sum);
+   }
+
+   free(tmp);
+ }
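For orientation, calculate_step above implements the standard damped PageRank update: each node's new rank is damping times the weighted sum of its in-neighbors' previous ranks (plus an equal share of every dangling node's previous rank), plus (1 - damping) / node_count. A minimal pure-Ruby sketch of the same update, with hypothetical names and edge weights already normalized per source as done in calculate_start, might look like:

  # Hypothetical sketch (not part of the gem): one PageRank iteration.
  # in_edges maps dest => { source => normalized_weight }; dangling is an Array of node keys.
  def page_rank_step(prev_ranks, in_edges, dangling, damping)
    n = prev_ranks.size.to_f
    prev_ranks.keys.each_with_object({}) do |dest, new_ranks|
      sum = (in_edges[dest] || {}).sum { |source, weight| prev_ranks[source] * weight }
      sum += dangling.sum { |source| prev_ranks[source] / n }
      new_ranks[dest] = damping * sum + (1 - damping) / n
    end
  end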
data/ext/text_rank/page_rank_sparse_native.h ADDED
@@ -0,0 +1,93 @@
+ #ifndef PAGE_RANK_SPARSE_NATIVE_H
+ #define PAGE_RANK_SPARSE_NATIVE_H
+
+ #include <ruby.h>
+
+ struct NodeListStruct;
+ typedef struct NodeListStruct* NodeList;
+
+ typedef struct NodeListStruct {
+   struct NodeStruct *node;
+   struct NodeListStruct *next;
+ } NodeListStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct EdgeListStruct;
+ typedef struct EdgeListStruct* EdgeList;
+
+ typedef struct EdgeListStruct {
+   struct EdgeStruct *edge;
+   struct EdgeListStruct *next;
+ } EdgeListStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct NodeStruct;
+ typedef struct NodeStruct* Node;
+
+ typedef struct NodeStruct {
+   EdgeList source_edges;
+   VALUE label;
+   double prev_rank;
+   double rank;
+   double outbound_weight_total;
+ } NodeStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct EdgeStruct;
+ typedef struct EdgeStruct* Edge;
+
+ typedef struct EdgeStruct {
+   Node source;
+   double weight;
+ } EdgeStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct GraphStruct;
+ typedef struct GraphStruct* Graph;
+
+ typedef struct GraphStruct {
+   unsigned long node_count;
+   NodeList nodes;
+   NodeList dangling_nodes;
+   st_table *node_lookup;
+ } GraphStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void free_graph(void *data);
+ void free_node(Node n);
+ void free_node_list(NodeList nodes, void (*free_item)(Node));
+ void free_edge(Edge e);
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge));
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ Node add_node(Graph g, VALUE label);
+ Node add_dangling_node(Graph g, Node n);
+ Edge add_edge(Node source, Node destination, double weight);
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
+ Node lookup_node(Graph g, VALUE label);
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void calculate_start(Graph g);
+ void calculate_step(Graph g, double damping);
+ double prev_distance(Graph g);
+ void calculate(Graph g, int max_iterations, double damping, double tolerance);
+ int node_compare(const void *v1, const void *v2);
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void Init_sparse_native();
+ VALUE sparse_native_allocate(VALUE self);
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
+ VALUE sorted_and_normalized_ranks(Graph g);
+ void rb_hash_dset(VALUE hash, VALUE key, double value);
+
+ #endif
data/ext/text_rank/text_rank.c ADDED
@@ -0,0 +1,5 @@
+ #include <page_rank_sparse_native.h>
+
+ void Init_text_rank() {
+   Init_sparse_native();
+ }
data/lib/page_rank/base.rb CHANGED
@@ -7,6 +7,8 @@ module PageRank
  ##
  class Base

+ attr_reader :damping, :tolerance
+
  # @param (see #damping=)
  # @param (see #tolerance=)
  def initialize(damping: nil, tolerance: nil, **_)
@@ -19,8 +21,7 @@
  # @return [Float]
  def damping=(damping)
  @damping = damping || 0.85
- raise ArgumentError.new('Invalid damping factor') if @damping <= 0 || @damping > 1
- @damping
+ raise ArgumentError, 'Invalid damping factor' if @damping <= 0 || @damping > 1
  end

  # Set the tolerance value
@@ -28,8 +29,7 @@
  # @return [Float]
  def tolerance=(tolerance)
  @tolerance = tolerance || 0.0001
- raise ArgumentError.new('Invalid tolerance factor') if @tolerance < 0 || @tolerance > 1
- @tolerance
+ raise ArgumentError, 'Invalid tolerance factor' if @tolerance.negative? || @tolerance > 1
  end

  # Adds a directed (and optionally weighted) edge to the graph
@@ -46,9 +46,12 @@
  def calculate(max_iterations: -1, **_)
  ranks = initial_ranks
  loop do
- break if max_iterations == 0
- ranks, prev_ranks = calculate_step(ranks), ranks
- break if distance(ranks, prev_ranks) < @tolerance
+ break if max_iterations.zero?
+
+ prev_ranks = ranks
+ ranks = calculate_step(ranks)
+ break if distance(ranks, prev_ranks) < tolerance
+
  max_iterations -= 1
  end
  sort_ranks(ranks)
@@ -77,9 +80,9 @@
  end

  # Calculate the Euclidean distance from one ranking to the next iteration
- def distance(v1, v2)
+ def distance(vector1, vector2)
  sum_squares = node_count.times.reduce(0.0) do |sum, i|
- d = v1[i] - v2[i]
+ d = vector1[i] - vector2[i]
  sum + d * d
  end
  Math.sqrt(sum_squares)
data/lib/page_rank/dense.rb CHANGED
@@ -32,6 +32,7 @@
  # @return (see Base#add)
  def add(source, dest, weight: 1.0)
  return if source == dest
+
  source_idx = index(source)
  dest_idx = index(dest)
  @out_links[source_idx] ||= []
@@ -72,13 +73,13 @@

  def to_matrix
  total_out_weights = @out_links.map do |links|
- links.compact.reduce(:+) if links
+ links&.compact&.reduce(:+)
  end
  Matrix.build(node_count, node_count) do |dest_idx, source_idx|
  total = total_out_weights[source_idx]
  if total
  w = @out_links[source_idx][dest_idx] || 0.0
- @damping * w / total + (1 - @damping) / node_count.to_f
+ damping * w / total + (1 - damping) / node_count.to_f
  else
  1.0 / node_count.to_f
  end
data/lib/page_rank/sparse.rb CHANGED
@@ -1,5 +1,3 @@
- require 'set'
-
  module PageRank
  ##
  # Implementation of PageRank using a sparse matrix representation of the graph
@@ -33,6 +31,7 @@
  # @return (see Base#add)
  def add(source, dest, weight: 1.0)
  return false if source == dest
+
  @graph[dest] ||= Set.new
  @graph[dest] << source
  @weights[source] ||= Hash.new(0.0)
@@ -53,8 +52,8 @@
  def initial_ranks
  @dangling_nodes = @nodes - @weight_totals.keys
  @normalized_weights = @weights.each_with_object({}) do |(source, values), h|
- h[source] = values.each_with_object({}) do |(dest, w), h2|
- h2[dest] = w / @weight_totals[source]
+ h[source] = values.transform_values do |w|
+ w / @weight_totals[source]
  end
  end
  Hash[@nodes.map { |k| [k, 1.0 / node_count.to_f] }]
@@ -69,7 +68,7 @@
  @dangling_nodes.each do |source|
  sum += ranks[source] / node_count.to_f
  end
- new_ranks[dest] = @damping * sum + (1 - @damping)/node_count
+ new_ranks[dest] = damping * sum + (1 - damping) / node_count
  end
  end

@@ -79,8 +78,8 @@
  Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
  end

- def distance(v1, v2)
- super(v1.values.to_a, v2.values.to_a)
+ def distance(vector1, vector2)
+ super(vector1.values.to_a, vector2.values.to_a)
  end

  end
data/lib/page_rank/sparse_native.rb ADDED
@@ -0,0 +1,21 @@
+ module PageRank
+   class SparseNative < Base
+
+     #require 'page_rank/sparse_native.so'
+
+     # @param (see Base#add)
+     # @param weight [Float] Optional weight for the graph edge
+     # @return (see Base#add)
+     def add(source, dest, weight: 1.0)
+       _add_edge(source, dest, weight) unless source == dest
+     end
+
+     # Perform the PageRank calculation
+     # @param max_iterations [Fixnum] Maximum number of PageRank iterations to perform (or -1 for no max)
+     # @return [Hash<Object, Float>] of nodes with rank
+     def calculate(max_iterations: -1, **_)
+       _calculate(max_iterations, damping, tolerance)
+     end
+
+   end
+ end
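As a rough usage sketch (assuming the gem is installed, its native extension has been compiled, e.g. via the rake compile task added to the Rakefile and .travis.yml above, and loaded through the gem's normal entry point), the new strategy class delegates add and calculate to the private native methods and could be exercised directly:

  # Hypothetical example; node labels and weights are illustrative only.
  require 'text_rank'

  pr = PageRank::SparseNative.new(damping: 0.85, tolerance: 0.0001)
  pr.add(:a, :b, weight: 2.0)
  pr.add(:b, :a)
  pr.calculate(max_iterations: 100)
  # => Hash of node labels to ranks, normalized to sum to 1 and sorted in decreasing order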
data/lib/page_rank.rb CHANGED
@@ -1,3 +1,5 @@
+ require 'set'
+
  ##
  # A module for supporting Ruby implementations of PageRank. Rather than rely on
  # one single implementation, this module allows for multiple implementations that
@@ -15,16 +17,17 @@
  ##
  module PageRank

- autoload :Base, 'page_rank/base'
- autoload :Dense, 'page_rank/dense'
- autoload :Sparse, 'page_rank/sparse'
+ autoload :Base, 'page_rank/base'
+ autoload :Dense, 'page_rank/dense'
+ autoload :Sparse, 'page_rank/sparse'
+ autoload :SparseNative, 'page_rank/sparse_native'

  # @option options [Symbol] :strategy PageRank strategy to use (either :sparse or :dense)
  # @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
  # @option options [Float] :tolerance The desired accuracy of the results
  # @return [PageRank::Base]
  def self.new(strategy: :sparse, **options)
- const_get(strategy.to_s.capitalize).new(**options)
+ const_get(strategy.to_s.split('_').map(&:capitalize).join).new(**options)
  end

  # Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
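The revised strategy lookup above is what lets the multi-word :sparse_native symbol resolve to the new class: it splits on underscores and capitalizes each part before the const_get, whereas the old String#capitalize would have produced "Sparse_native". A quick illustration of the mapping:

  [:sparse, :dense, :sparse_native].map { |s| s.to_s.split('_').map(&:capitalize).join }
  # => ["Sparse", "Dense", "SparseNative"]

So PageRank.new(strategy: :sparse_native, damping: 0.85) now resolves to PageRank::SparseNative.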