text_rank 1.2.4 → 1.3.1

@@ -0,0 +1,293 @@
+ #include <ruby.h>
+ #include <math.h>
+ #include <page_rank_sparse_native.h>
+
+ const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
+ const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
+ const size_t NODE_SIZE = sizeof(NodeStruct);
+ const size_t EDGE_SIZE = sizeof(EdgeStruct);
+ const size_t GRAPH_SIZE = sizeof(GraphStruct);
+
+ static const rb_data_type_t graph_typed_data = {
+   "PageRank/SparseNative/Graph",
+   { 0, free_graph, },
+   0, 0,
+   RUBY_TYPED_FREE_IMMEDIATELY,
+ };
+
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void Init_sparse_native() {
+   VALUE PageRankModule, SparseNativeClass;
+
+   PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
+   SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
+
+   rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
+   rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
+   rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
+ }
+
+ VALUE sparse_native_allocate(VALUE self) {
+   Graph g = malloc(GRAPH_SIZE);
+
+   // Grab a reference to the hash type used by a generic Ruby {}
+   // which accepts any key and any value. We'll need this type to create
+   // a st_table in which to put arbitrary VALUE keys. This hash type
+   // should be a static constant and thus should be safe to utilize without
+   // fear of garbage collection.
+   const struct st_hash_type *objhash = rb_hash_tbl(rb_hash_new(), "page_rank_sparse_native.c", 40)->type;
+
+   g->node_count = 0;
+   g->nodes = NULL;
+   g->dangling_nodes = NULL;
+   g->node_lookup = st_init_table_with_size(objhash, 0);
+
+   return TypedData_Wrap_Struct(self, &graph_typed_data, g);
+ }
+
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
+   Graph g;
+
+   TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
+   add_edge_with_labels(g, source, dest, NUM2DBL(weight));
+   return Qnil;
+ }
+
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
+   Graph g;
+   VALUE ranks;
+
+   TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
+   calculate(g, FIX2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));
+
+   ranks = rb_hash_new();
+   sort_and_normalize_ranks(g, rb_hash_dset, ranks);
+   return ranks;
+ }
+
+ void rb_hash_dset(VALUE hash, VALUE key, double value) {
+   rb_hash_aset(hash, key, DBL2NUM(value));
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void free_graph(void *data) {
+   Graph g = (Graph)data;
+   free_node_list(g->nodes, free_node);
+   free_node_list(g->dangling_nodes, NULL);
+   free(g->node_lookup);
+   free(g);
+ }
+
+ void free_node(Node n) {
+   free_edge_list(n->source_edges, free_edge);
+   free(n);
+ }
+
+ void free_node_list(NodeList nodes, void (*free_item)(Node)) {
+   while (nodes != NULL) {
+     NodeList tmp = nodes;
+     nodes = nodes->next;
+     if (free_item) {
+       free_item(tmp->node);
+     }
+     free(tmp);
+   }
+ }
+
+ void free_edge(Edge e) {
+   // Assume source node was allocated elsewhere and will be free'd elsewhere
+   free(e);
+ }
+
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
+   while (edges != NULL) {
+     EdgeList tmp = edges;
+     edges = edges->next;
+     if (free_item) {
+       free_item(tmp->edge);
+     }
+     free(tmp);
+   }
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ Node add_node(Graph g, VALUE label) {
+   NodeList tmp = malloc(NODE_LIST_SIZE);
+
+   tmp->node = malloc(NODE_SIZE);
+   tmp->node->label = label;
+   tmp->node->source_edges = NULL;
+   tmp->node->rank = 0.0;
+   tmp->node->prev_rank = 0.0;
+   tmp->node->outbound_weight_total = 0.0;
+
+   tmp->next = g->nodes;
+   g->nodes = tmp;
+   g->node_count += 1;
+
+   return tmp->node;
+ }
+
+ Node add_dangling_node(Graph g, Node n) {
+   NodeList tmp = malloc(NODE_LIST_SIZE);
+
+   tmp->node = n;
+   tmp->next = g->dangling_nodes;
+   g->dangling_nodes = tmp;
+
+   return n;
+ }
+
+ Edge add_edge(Node source, Node destination, double weight) {
+   EdgeList tmp = malloc(EDGE_LIST_SIZE);
+
+   tmp->edge = malloc(EDGE_SIZE);
+   tmp->edge->source = source;
+   tmp->edge->weight = weight;
+
+   tmp->next = destination->source_edges;
+   destination->source_edges = tmp;
+   source->outbound_weight_total += weight;
+
+   return tmp->edge;
+ }
+
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
+   Node source, dest;
+
+   source = lookup_node(g, source_label);
+   dest = lookup_node(g, dest_label);
+
+   return add_edge(source, dest, weight);
+ }
+
+ Node lookup_node(Graph g, VALUE label) {
+   Node n;
+
+   if (!st_lookup(g->node_lookup, (st_data_t)label, (st_data_t *)&n)) {
+     n = add_node(g, label);
+     st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
+   }
+   return n;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void calculate_start(Graph g) {
+   NodeList nodes;
+   Node source, destination;
+   EdgeList edges;
+   Edge e;
+
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     destination = nodes->node;
+
+     // If there are no outbound edges, this is a "dangling" node
+     if (destination->outbound_weight_total == 0.0) {
+       add_dangling_node(g, destination);
+     }
+
+     // Normalize all source edge weights
+     for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
+       e = edges->edge;
+       source = e->source;
+       e->weight = e->weight / source->outbound_weight_total;
+     }
+
+     // Set the initial rank
+     destination->prev_rank = 0;
+     destination->rank = 1.0 / g->node_count;
+   }
+ }
+
+ void calculate_step(Graph g, double damping) {
+   NodeList nodes, dangling_nodes;
+   Node source, destination;
+   EdgeList edges;
+   Edge e;
+
+   // Set prev rank to rank for all nodes
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     destination = nodes->node;
+     destination->prev_rank = destination->rank;
+   }
+
+   // Redistribute the rankings according to weight
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     destination = nodes->node;
+     double sum = 0.0;
+     for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
+       e = edges->edge;
+       source = e->source;
+       sum += source->prev_rank * e->weight;
+     }
+     for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
+       source = dangling_nodes->node;
+       sum += source->prev_rank / g->node_count;
+     }
+     destination->rank = damping * sum + (1 - damping) / g->node_count;
+   }
+ }
+
+ // Calculate the Euclidean distance from prev_rank to rank across all nodes
+ double prev_distance(Graph g) {
+   double sum_squares = 0.0;
+
+   for (NodeList nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     Node n = nodes->node;
+     double rank_diff = n->prev_rank - n->rank;
+     sum_squares += rank_diff * rank_diff;
+   }
+
+   return sqrt(sum_squares);
+ }
+
+ void calculate(Graph g, int max_iterations, double damping, double tolerance) {
+   calculate_start(g);
+
+   while (max_iterations != 0) { // A max_iterations of -1 means there is no iteration limit
+     calculate_step(g, damping);
+     if (prev_distance(g) < tolerance) {
+       break;
+     }
+     max_iterations--;
+   }
+ }
+
+ int node_compare(const void *v1, const void *v2) {
+   double rank1 = (*(Node *)v1)->rank;
+   double rank2 = (*(Node *)v2)->rank;
+   double cmp = rank2 - rank1; // Decreasing order
+   if (cmp < 0) return -1;
+   if (cmp > 0) return 1;
+   return 0;
+ }
+
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
+   NodeList nodes;
+   Node n;
+   double sum = 0.0;
+   unsigned long i;
+   Node *tmp;
+
+   i = g->node_count;
+   tmp = malloc(g->node_count * sizeof(Node));
+   for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
+     n = nodes->node;
+     tmp[--i] = n;
+     sum += n->rank;
+   }
+
+   qsort(tmp, g->node_count, sizeof(Node), node_compare);
+
+   for (i = 0; i < g->node_count; i++) {
+     n = tmp[i];
+     callback(callback_arg, n->label, n->rank / sum);
+   }
+
+   free(tmp);
+ }
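The per-iteration update performed by calculate_step above is the standard damped PageRank rule, the same rule the pure-Ruby sparse strategy applies later in this diff. A minimal pure-Ruby sketch of that update, for reference only (the data structures and sample values below are illustrative, not the gem's internals):

# One PageRank iteration over normalized edge weights.
# in_edges maps dest => { source => normalized weight }; dangling lists nodes
# with no outbound edges, whose rank is spread evenly across the graph.
def page_rank_step(ranks, in_edges, dangling, damping)
  n = ranks.size.to_f
  ranks.keys.to_h do |dest|
    sum = (in_edges[dest] || {}).sum { |source, w| ranks[source] * w }
    sum += dangling.sum { |source| ranks[source] / n }
    [dest, (damping * sum) + ((1 - damping) / n)]
  end
end

ranks = { 'a' => 1 / 3.0, 'b' => 1 / 3.0, 'c' => 1 / 3.0 }
in_edges = { 'b' => { 'a' => 1.0 }, 'c' => { 'b' => 1.0 } } # edges a->b and b->c
dangling = ['c']                                            # c has no outbound edges
3.times { ranks = page_rank_step(ranks, in_edges, dangling, 0.85) }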
@@ -0,0 +1,93 @@
+ #ifndef PAGE_RANK_SPARSE_NATIVE_H
+ #define PAGE_RANK_SPARSE_NATIVE_H
+
+ #include <ruby.h>
+
+ struct NodeListStruct;
+ typedef struct NodeListStruct* NodeList;
+
+ typedef struct NodeListStruct {
+   struct NodeStruct *node;
+   struct NodeListStruct *next;
+ } NodeListStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct EdgeListStruct;
+ typedef struct EdgeListStruct* EdgeList;
+
+ typedef struct EdgeListStruct {
+   struct EdgeStruct *edge;
+   struct EdgeListStruct *next;
+ } EdgeListStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct NodeStruct;
+ typedef struct NodeStruct* Node;
+
+ typedef struct NodeStruct {
+   EdgeList source_edges;
+   VALUE label;
+   double prev_rank;
+   double rank;
+   double outbound_weight_total;
+ } NodeStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct EdgeStruct;
+ typedef struct EdgeStruct* Edge;
+
+ typedef struct EdgeStruct {
+   Node source;
+   double weight;
+ } EdgeStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ struct GraphStruct;
+ typedef struct GraphStruct* Graph;
+
+ typedef struct GraphStruct {
+   unsigned long node_count;
+   NodeList nodes;
+   NodeList dangling_nodes;
+   st_table *node_lookup;
+ } GraphStruct;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void free_graph(void *data);
+ void free_node(Node n);
+ void free_node_list(NodeList nodes, void (*free_item)(Node));
+ void free_edge(Edge e);
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge));
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ Node add_node(Graph g, VALUE label);
+ Node add_dangling_node(Graph g, Node n);
+ Edge add_edge(Node source, Node destination, double weight);
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
+ Node lookup_node(Graph g, VALUE label);
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void calculate_start(Graph g);
+ void calculate_step(Graph g, double damping);
+ double prev_distance(Graph g);
+ void calculate(Graph g, int max_iterations, double damping, double tolerance);
+ int node_compare(const void *v1, const void *v2);
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);
+
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ void Init_sparse_native();
+ VALUE sparse_native_allocate(VALUE self);
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
+ VALUE sorted_and_normalized_ranks(Graph g);
+ void rb_hash_dset(VALUE hash, VALUE key, double value);
+
+ #endif
@@ -0,0 +1,6 @@
+ #include <page_rank_sparse_native.h>
+
+ // cppcheck-suppress unusedFunction
+ void Init_text_rank() {
+   Init_sparse_native();
+ }
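This diff does not show the extconf.rb that compiles these sources, so the following is only a rough sketch of how such a build script typically looks with mkmf; the create_makefile target name is an assumption chosen to match the Init_text_rank entry point above, not taken from the gem:

# extconf.rb (hypothetical) -- generates the Makefile for the C extension.
require 'mkmf'

# Let #include <page_rank_sparse_native.h> resolve from the extension directory.
$INCFLAGS << " -I$(srcdir)"

# The basename must match the Init_text_rank function so Ruby can load the library.
create_makefile('text_rank/text_rank')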
@@ -7,6 +7,8 @@ module PageRank
  ##
  class Base

+ attr_reader :damping, :tolerance
+
  # @param (see #damping=)
  # @param (see #tolerance=)
  def initialize(damping: nil, tolerance: nil, **_)
@@ -48,7 +50,7 @@ module PageRank

  prev_ranks = ranks
  ranks = calculate_step(ranks)
- break if distance(ranks, prev_ranks) < @tolerance
+ break if distance(ranks, prev_ranks) < tolerance

  max_iterations -= 1
  end
@@ -81,7 +83,7 @@ module PageRank
  def distance(vector1, vector2)
  sum_squares = node_count.times.reduce(0.0) do |sum, i|
  d = vector1[i] - vector2[i]
- sum + d * d
+ sum + (d * d)
  end
  Math.sqrt(sum_squares)
  end
@@ -79,7 +79,7 @@ module PageRank
  total = total_out_weights[source_idx]
  if total
  w = @out_links[source_idx][dest_idx] || 0.0
- @damping * w / total + (1 - @damping) / node_count.to_f
+ (damping * w / total) + ((1 - damping) / node_count.to_f)
  else
  1.0 / node_count.to_f
  end
@@ -56,7 +56,7 @@ module PageRank
  w / @weight_totals[source]
  end
  end
- Hash[@nodes.map { |k| [k, 1.0 / node_count.to_f] }]
+ @nodes.to_h { |k| [k, 1.0 / node_count.to_f] }
  end

  def calculate_step(ranks)
@@ -68,14 +68,14 @@ module PageRank
  @dangling_nodes.each do |source|
  sum += ranks[source] / node_count.to_f
  end
- new_ranks[dest] = @damping * sum + (1 - @damping) / node_count
+ new_ranks[dest] = (damping * sum) + ((1 - damping) / node_count)
  end
  end

  def sort_ranks(ranks)
  sum = 0.0
  ranks.each { |_, v| sum += v }
- Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
+ ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }.to_h
  end

  def distance(vector1, vector2)
@@ -0,0 +1,21 @@
+ module PageRank
+   class SparseNative < Base
+
+     # require 'page_rank/sparse_native.so'
+
+     # @param (see Base#add)
+     # @param weight [Float] Optional weight for the graph edge
+     # @return (see Base#add)
+     def add(source, dest, weight: 1.0)
+       _add_edge(source, dest, weight) unless source == dest
+     end
+
+     # Perform the PageRank calculation
+     # @param max_iterations [Fixnum] Maximum number of PageRank iterations to perform (or -1 for no max)
+     # @return [Hash<Object, Float>] of nodes with rank
+     def calculate(max_iterations: -1, **_)
+       _calculate(max_iterations, damping, tolerance)
+     end
+
+   end
+ end
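A hedged usage sketch of the new strategy through the PageRank interface shown above (the require path, node labels, and damping/tolerance values are illustrative assumptions, not taken from the gem's docs):

require 'text_rank' # load path assumed; it defines the PageRank module

pr = PageRank.new(strategy: :sparse_native, damping: 0.85, tolerance: 0.0001)
pr.add('a', 'b', weight: 2.0)
pr.add('b', 'c')
pr.add('c', 'a')
pr.calculate(max_iterations: 100)
# => a Hash of node label => normalized rank, in descending rank order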
data/lib/page_rank.rb CHANGED
@@ -17,16 +17,17 @@ require 'set'
  ##
  module PageRank

- autoload :Base,   'page_rank/base'
- autoload :Dense,  'page_rank/dense'
- autoload :Sparse, 'page_rank/sparse'
+ autoload :Base,         'page_rank/base'
+ autoload :Dense,        'page_rank/dense'
+ autoload :Sparse,       'page_rank/sparse'
+ autoload :SparseNative, 'page_rank/sparse_native'

  # @option options [Symbol] :strategy PageRank strategy to use (either :sparse or :dense)
  # @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
  # @option options [Float] :tolerance The desired accuracy of the results
  # @return [PageRank::Base]
  def self.new(strategy: :sparse, **options)
- const_get(strategy.to_s.capitalize).new(**options)
+ const_get(strategy.to_s.split('_').map(&:capitalize).join).new(**options)
  end

  # Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
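The new constant lookup simply camel-cases an underscored strategy name, so the native strategy can be selected the same way as the existing ones. For example:

:sparse_native.to_s.split('_').map(&:capitalize).join
# => "SparseNative"  (so strategy: :sparse_native resolves PageRank::SparseNative)
:sparse.to_s.split('_').map(&:capitalize).join
# => "Sparse"        (existing strategies keep working)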
@@ -3,14 +3,12 @@ module TextRank
  ##
  # Characater filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
  #
- # rubocop:disable Style/AsciiComments
  #
  # = Example
  #
  # AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
  # => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
  #
- # rubocop:enable Style/AsciiComments
  #
  ##
  class AsciiFolding
@@ -17,6 +17,7 @@ module TextRank
  class StripHtml < Nokogiri::XML::SAX::Document

  def initialize
+ super
  @text = StringIO.new
  end

@@ -57,7 +57,7 @@ module TextRank
  end

  # Calculates the "similarity" between this fingerprint and another
- # @param {Fingerprint} A second fingerprint to compare
+ # @param {Fingerprint} other A second fingerprint to compare
  # @return [Number] A number between 0.0 (different) and 1.0 (same)
  def similarity(other)
  return 1.0 if values == other.values # Short-circuit for efficiency
@@ -83,7 +83,7 @@ module TextRank

  def norm_factor
  @norm_factor ||= size.times.reduce(0.0) do |s, i|
- s + (i + 1) / Math.log(i + 2) / size.to_f
+ s + ((i + 1) / Math.log(i + 2) / size.to_f)
  end
  end

@@ -60,7 +60,7 @@ module TextRank
  # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
  # return [nil]
  def build_graph(tokens, graph)
- ngram_window = @ngram_size * 2 + 1
+ ngram_window = (@ngram_size * 2) + 1
  tokens.size.times do |i|
  ngram_window.times do |j|
  consider_ngram_window(tokens, graph, i, j)
@@ -71,14 +71,14 @@ module TextRank

  private

- def consider_ngram_window(tokens, graph, i, j)
- return if j == @ngram_size || i + j < @ngram_size
+ def consider_ngram_window(tokens, graph, idx_i, idx_j)
+ return if idx_j == @ngram_size || idx_i + idx_j < @ngram_size

- token_i = tokens[i]
- token_j = tokens[i - @ngram_size + j]
+ token_i = tokens[idx_i]
+ token_j = tokens[idx_i - @ngram_size + idx_j]

  if token_j
- graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
+ graph.add(token_i, token_j, weight: 1.0 / (idx_j - @ngram_size).abs)
  end
  end

@@ -71,7 +71,6 @@ module TextRank
  end

  # Sets the graph strategy for producing a graph from tokens
- # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
  # @return [Class, Symbol, #build_graph]
  attr_writer :graph_strategy

@@ -103,14 +102,23 @@ module TextRank
  end

  # Filter & tokenize text, and return PageRank
- # @param text [String] unfiltered text to be processed
+ # @param text [String,Array<String>] unfiltered text to be processed
  # @return [Hash<String, Float>] tokens and page ranks (in descending order)
  def extract(text, **options)
- tokens = tokenize(text)
+ text = Array(text)
+ tokens_per_text = text.map do |t|
+ tokenize(t)
+ end
  graph = PageRank.new(**@page_rank_options)
- classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
+ strategy = classify(@graph_strategy, context: GraphStrategy)
+ tokens_per_text.each do |tokens|
+ strategy.build_graph(tokens, graph)
+ end
  ranks = graph.calculate(**options)
- apply_rank_filters(ranks, tokens: tokens, original_text: text)
+ tokens_per_text.each_with_index do |tokens, i|
+ ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
+ end
+ ranks
  end

  private
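With this change, extract accepts either a single string or an array of strings; every document's tokens feed one shared PageRank graph, and the rank filters are then applied once per document before the combined ranks are returned. A hedged example (assuming this method lives on TextRank::KeywordExtractor and that a factory such as the gem's advanced constructor is available; neither is shown in this diff):

extractor = TextRank::KeywordExtractor.advanced # factory name assumed
texts = [
  'The quick brown fox jumps over the lazy dog.',
  'A lazy dog sleeps while the quick fox runs.',
]
keywords = extractor.extract(texts, max_iterations: 100)
# => combined keyword ranks drawn from both texts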
@@ -151,7 +151,7 @@ module TextRank
  # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
  # to find what we can.
  def scan_text_for_all_permutations_of(single_tokens)
- # NOTE that by reversing the order we craft the regex to prefer larger combinations over
+ # NOTE: that by reversing the order we craft the regex to prefer larger combinations over
  # smaller combinations (or singletons).
  perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
  scan_text_for_n_permutations_of(single_tokens, n)
@@ -162,8 +162,8 @@ module TextRank
  end unless perms.empty?
  end

- def scan_text_for_n_permutations_of(single_tokens, n)
- single_tokens.permutation(n).map do |perm|
+ def scan_text_for_n_permutations_of(single_tokens, n_perms)
+ single_tokens.permutation(n_perms).map do |perm|
  unless @permutations_scanned.key?(perm)
  @permutations_scanned[perm] = 0
  perm
@@ -14,7 +14,7 @@ module TextRank
  # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
  # @return [Hash<String, Float>]
  def filter!(ranks, **_)
- Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
+ ranks.sort_by { |_, v| @descending ? -v : v }.to_h
  end

  end
@@ -1,7 +1,7 @@
  module TextRank
  module Tokenizer

- CURRENCY_SYMBOLS = '[' + [
+ CURRENCY_SYMBOLS = "[#{[
  "\u00a4", # Generic Currency Symbol
  "\u0024", # Dollar Sign
  "\u00a2", # Cent Sign
@@ -26,14 +26,13 @@ module TextRank
  "\u20ab", # Dong Sign
  "\u0025", # Percent
  "\u2030", # Per Million
- ].join + ']'
+ ].join}]"
  private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion

  ##
  # A tokenizer regex that preserves money or formatted numbers as a single token. This
  # currently supports 24 different currency symbols:
  #
- # rubocop:disable Style/AsciiComments
  #
  # * ¤
  # * $
@@ -60,7 +59,6 @@ module TextRank
  # * %
  # * ‰

- # rubocop:enable Style/AsciiComments
  #
  # It also supports two alternative formats for negatives as well as optional three digit comma
  # separation and optional decimals.