text_rank 1.2.4 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ #include <ruby.h>
2
+ #include <math.h>
3
+ #include <page_rank_sparse_native.h>
4
+
5
+ const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
6
+ const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
7
+ const size_t NODE_SIZE = sizeof(NodeStruct);
8
+ const size_t EDGE_SIZE = sizeof(EdgeStruct);
9
+ const size_t GRAPH_SIZE = sizeof(GraphStruct);
10
+
11
+ static const rb_data_type_t graph_typed_data = {
12
+ "PageRank/SparseNative/Graph",
13
+ { 0, free_graph, },
14
+ 0, 0,
15
+ RUBY_TYPED_FREE_IMMEDIATELY,
16
+ };
17
+
18
+
19
+ //////////////////////////////////////////////////////////////////////////////////////
20
+
21
+ void Init_sparse_native() {
22
+ VALUE PageRankModule, SparseNativeClass;
23
+
24
+ PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
25
+ SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
26
+
27
+ rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
28
+ rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
29
+ rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
30
+ }
31
+
32
+ VALUE sparse_native_allocate(VALUE self) {
33
+ Graph g = malloc(GRAPH_SIZE);
34
+
35
+ // Grab a reference to the hash type used by a generic Ruby {}
36
+ // which accepts any key and any value. We'll need this type to create
37
+ // a st_table in which to put arbitrary VALUE keys. This hash type
38
+ // should be a static constant and thus should be safe to utilize without
39
+ // fear of garbage collection.
40
+ const struct st_hash_type *objhash = rb_hash_tbl(rb_hash_new(), "page_rank_sparse_native.c", 40)->type;
41
+
42
+ g->node_count = 0;
43
+ g->nodes = NULL;
44
+ g->dangling_nodes = NULL;
45
+ g->node_lookup = st_init_table_with_size(objhash, 0);
46
+
47
+ return TypedData_Wrap_Struct(self, &graph_typed_data, g);
48
+ }
49
+
50
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
51
+ Graph g;
52
+
53
+ TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
54
+ add_edge_with_labels(g, source, dest, NUM2DBL(weight));
55
+ return Qnil;
56
+ }
57
+
58
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
59
+ Graph g;
60
+ VALUE ranks;
61
+
62
+ TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
63
+ calculate(g, FIX2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));
64
+
65
+ ranks = rb_hash_new();
66
+ sort_and_normalize_ranks(g, rb_hash_dset, ranks);
67
+ return ranks;
68
+ }
69
+
70
+ void rb_hash_dset(VALUE hash, VALUE key, double value) {
71
+ rb_hash_aset(hash, key, DBL2NUM(value));
72
+ }
73
+
74
+ //////////////////////////////////////////////////////////////////////////////////////
75
+
76
+ void free_graph(void *data) {
77
+ Graph g = (Graph)data;
78
+ free_node_list(g->nodes, free_node);
79
+ free_node_list(g->dangling_nodes, NULL);
80
+ free(g->node_lookup);
81
+ free(g);
82
+ }
83
+
84
+ void free_node(Node n) {
85
+ free_edge_list(n->source_edges, free_edge);
86
+ free(n);
87
+ }
88
+
89
+ void free_node_list(NodeList nodes, void (*free_item)(Node)) {
90
+ while (nodes != NULL) {
91
+ NodeList tmp = nodes;
92
+ nodes = nodes->next;
93
+ if (free_item) {
94
+ free_item(tmp->node);
95
+ }
96
+ free(tmp);
97
+ }
98
+ }
99
+
100
+ void free_edge(Edge e) {
101
+ // Assume source node was allocated elsewhere and will be free'd elsewhere
102
+ free(e);
103
+ }
104
+
105
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
106
+ while (edges != NULL) {
107
+ EdgeList tmp = edges;
108
+ edges = edges->next;
109
+ if (free_item) {
110
+ free_item(tmp->edge);
111
+ }
112
+ free(tmp);
113
+ }
114
+ }
115
+
116
+ //////////////////////////////////////////////////////////////////////////////////////
117
+
118
+ Node add_node(Graph g, VALUE label) {
119
+ NodeList tmp = malloc(NODE_LIST_SIZE);
120
+
121
+ tmp->node = malloc(NODE_SIZE);
122
+ tmp->node->label = label;
123
+ tmp->node->source_edges = NULL;
124
+ tmp->node->rank = 0.0;
125
+ tmp->node->prev_rank = 0.0;
126
+ tmp->node->outbound_weight_total = 0.0;
127
+
128
+ tmp->next = g->nodes;
129
+ g->nodes = tmp;
130
+ g->node_count += 1;
131
+
132
+ return tmp->node;
133
+ }
134
+
135
+ Node add_dangling_node(Graph g, Node n) {
136
+ NodeList tmp = malloc(NODE_LIST_SIZE);
137
+
138
+ tmp->node = n;
139
+ tmp->next = g->dangling_nodes;
140
+ g->dangling_nodes = tmp;
141
+
142
+ return n;
143
+ }
144
+
145
+ Edge add_edge(Node source, Node destination, double weight) {
146
+ EdgeList tmp = malloc(EDGE_LIST_SIZE);
147
+
148
+ tmp->edge = malloc(EDGE_SIZE);
149
+ tmp->edge->source = source;
150
+ tmp->edge->weight = weight;
151
+
152
+ tmp->next = destination->source_edges;
153
+ destination->source_edges = tmp;
154
+ source->outbound_weight_total += weight;
155
+
156
+ return tmp->edge;
157
+ }
158
+
159
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
160
+ Node source, dest;
161
+
162
+ source = lookup_node(g, source_label);
163
+ dest = lookup_node(g, dest_label);
164
+
165
+ return add_edge(source, dest, weight);
166
+ }
167
+
168
+ Node lookup_node(Graph g, VALUE label) {
169
+ Node n;
170
+
171
+ if (!st_lookup(g->node_lookup, (st_data_t)label, (st_data_t *)&n)) {
172
+ n = add_node(g, label);
173
+ st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
174
+ }
175
+ return n;
176
+ }
177
+
178
+ //////////////////////////////////////////////////////////////////////////////////////
179
+
180
+ void calculate_start(Graph g) {
181
+ NodeList nodes;
182
+ Node source, destination;
183
+ EdgeList edges;
184
+ Edge e;
185
+
186
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
187
+ destination = nodes->node;
188
+
189
+ // If there is no outband, this is a "dangling" node
190
+ if (destination->outbound_weight_total == 0.0) {
191
+ add_dangling_node(g, destination);
192
+ }
193
+
194
+ // Normalize all source edge weights
195
+ for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
196
+ e = edges->edge;
197
+ source = e->source;
198
+ e->weight = e->weight / source->outbound_weight_total;
199
+ }
200
+
201
+ // Set the initial rank
202
+ destination->prev_rank = 0;
203
+ destination->rank = 1.0 / g->node_count;
204
+ }
205
+ }
206
+
207
+ void calculate_step(Graph g, double damping) {
208
+ NodeList nodes, dangling_nodes;
209
+ Node source, destination;
210
+ EdgeList edges;
211
+ Edge e;
212
+
213
+ // Set prev rank to rank for all nodes
214
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
215
+ destination = nodes->node;
216
+ destination->prev_rank = destination->rank;
217
+ }
218
+
219
+ // Re-destribute the rankings according to weight
220
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
221
+ destination = nodes->node;
222
+ double sum = 0.0;
223
+ for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
224
+ e = edges->edge;
225
+ source = e->source;
226
+ sum += source->prev_rank * e->weight;
227
+ }
228
+ for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
229
+ source = dangling_nodes->node;
230
+ sum += source->prev_rank / g->node_count;
231
+ }
232
+ destination->rank = damping * sum + (1 - damping) / g->node_count;
233
+ }
234
+ }
235
+
236
+ // Calculate the Euclidean distance from prev_rank to rank across all nodes
237
+ double prev_distance(Graph g) {
238
+ double sum_squares = 0.0;
239
+
240
+ for (NodeList nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
241
+ Node n = nodes->node;
242
+ double rank_diff = n->prev_rank - n->rank;
243
+ sum_squares += rank_diff * rank_diff;
244
+ }
245
+
246
+ return sqrt(sum_squares);
247
+ }
248
+
249
+ void calculate(Graph g, int max_iterations, double damping, double tolerance) {
250
+ calculate_start(g);
251
+
252
+ while (max_iterations != 0) { // If negative one, allow to go without limit
253
+ calculate_step(g, damping);
254
+ if (prev_distance(g) < tolerance) {
255
+ break;
256
+ }
257
+ max_iterations--;
258
+ }
259
+ }
260
+
261
+ int node_compare(const void *v1, const void *v2) {
262
+ double rank1 = (*(Node *)v1)->rank;
263
+ double rank2 = (*(Node *)v2)->rank;
264
+ double cmp = rank2 - rank1; // Decreasing order
265
+ if (cmp < 0) return -1;
266
+ if (cmp > 0) return 1;
267
+ return 0;
268
+ }
269
+
270
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
271
+ NodeList nodes;
272
+ Node n;
273
+ double sum = 0.0;
274
+ unsigned long i;
275
+ Node *tmp;
276
+
277
+ i = g->node_count;
278
+ tmp = malloc(g->node_count * sizeof(Node));
279
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
280
+ n = nodes->node;
281
+ tmp[--i] = n;
282
+ sum += n->rank;
283
+ }
284
+
285
+ qsort(tmp, g->node_count, sizeof(Node), node_compare);
286
+
287
+ for (i = 0; i < g->node_count; i++) {
288
+ n = tmp[i];
289
+ callback(callback_arg, n->label, n->rank / sum);
290
+ }
291
+
292
+ free(tmp);
293
+ }
@@ -0,0 +1,93 @@
1
+ #ifndef PAGE_RANK_SPARSE_NATIVE_H
2
+ #define PAGE_RANK_SPARSE_NATIVE_H
3
+
4
+ #include <ruby.h>
5
+
6
+ struct NodeListStruct;
7
+ typedef struct NodeListStruct* NodeList;
8
+
9
+ typedef struct NodeListStruct {
10
+ struct NodeStruct *node;
11
+ struct NodeListStruct *next;
12
+ } NodeListStruct;
13
+
14
+ //////////////////////////////////////////////////////////////////////////////////////
15
+
16
+ struct EdgeListStruct;
17
+ typedef struct EdgeListStruct* EdgeList;
18
+
19
+ typedef struct EdgeListStruct {
20
+ struct EdgeStruct *edge;
21
+ struct EdgeListStruct *next;
22
+ } EdgeListStruct;
23
+
24
+ //////////////////////////////////////////////////////////////////////////////////////
25
+
26
+ struct NodeStruct;
27
+ typedef struct NodeStruct* Node;
28
+
29
+ typedef struct NodeStruct {
30
+ EdgeList source_edges;
31
+ VALUE label;
32
+ double prev_rank;
33
+ double rank;
34
+ double outbound_weight_total;
35
+ } NodeStruct;
36
+
37
+ //////////////////////////////////////////////////////////////////////////////////////
38
+
39
+ struct EdgeStruct;
40
+ typedef struct EdgeStruct* Edge;
41
+
42
+ typedef struct EdgeStruct {
43
+ Node source;
44
+ double weight;
45
+ } EdgeStruct;
46
+
47
+ //////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ struct GraphStruct;
50
+ typedef struct GraphStruct* Graph;
51
+
52
+ typedef struct GraphStruct {
53
+ unsigned long node_count;
54
+ NodeList nodes;
55
+ NodeList dangling_nodes;
56
+ st_table *node_lookup;
57
+ } GraphStruct;
58
+
59
+ //////////////////////////////////////////////////////////////////////////////////////
60
+
61
+ void free_graph(void *data);
62
+ void free_node(Node n);
63
+ void free_node_list(NodeList nodes, void (*free_item)(Node));
64
+ void free_edge(Edge e);
65
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge));
66
+
67
+ //////////////////////////////////////////////////////////////////////////////////////
68
+
69
+ Node add_node(Graph g, VALUE label);
70
+ Node add_dangling_node(Graph g, Node n);
71
+ Edge add_edge(Node source, Node destination, double weight);
72
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
73
+ Node lookup_node(Graph g, VALUE label);
74
+
75
+ //////////////////////////////////////////////////////////////////////////////////////
76
+
77
+ void calculate_start(Graph g);
78
+ void calculate_step(Graph g, double damping);
79
+ double prev_distance(Graph g);
80
+ void calculate(Graph g, int max_iterations, double damping, double tolerance);
81
+ int node_compare(const void *v1, const void *v2);
82
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);
83
+
84
+ //////////////////////////////////////////////////////////////////////////////////////
85
+
86
+ void Init_sparse_native();
87
+ VALUE sparse_native_allocate(VALUE self);
88
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
89
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
90
+ VALUE sorted_and_normalized_ranks(Graph g);
91
+ void rb_hash_dset(VALUE hash, VALUE key, double value);
92
+
93
+ #endif
@@ -0,0 +1,6 @@
1
+ #include <page_rank_sparse_native.h>
2
+
3
// Initializer for the text_rank extension; it has no callers in C, hence
// the cppcheck suppression. All work is delegated to Init_sparse_native().
// cppcheck-suppress unusedFunction
void Init_text_rank() {
  Init_sparse_native();
}
@@ -7,6 +7,8 @@ module PageRank
7
7
  ##
8
8
  class Base
9
9
 
10
+ attr_reader :damping, :tolerance
11
+
10
12
  # @param (see #damping=)
11
13
  # @param (see #tolerance=)
12
14
  def initialize(damping: nil, tolerance: nil, **_)
@@ -48,7 +50,7 @@ module PageRank
48
50
 
49
51
  prev_ranks = ranks
50
52
  ranks = calculate_step(ranks)
51
- break if distance(ranks, prev_ranks) < @tolerance
53
+ break if distance(ranks, prev_ranks) < tolerance
52
54
 
53
55
  max_iterations -= 1
54
56
  end
@@ -81,7 +83,7 @@ module PageRank
81
83
  def distance(vector1, vector2)
82
84
  sum_squares = node_count.times.reduce(0.0) do |sum, i|
83
85
  d = vector1[i] - vector2[i]
84
- sum + d * d
86
+ sum + (d * d)
85
87
  end
86
88
  Math.sqrt(sum_squares)
87
89
  end
@@ -79,7 +79,7 @@ module PageRank
79
79
  total = total_out_weights[source_idx]
80
80
  if total
81
81
  w = @out_links[source_idx][dest_idx] || 0.0
82
- @damping * w / total + (1 - @damping) / node_count.to_f
82
+ (damping * w / total) + ((1 - damping) / node_count.to_f)
83
83
  else
84
84
  1.0 / node_count.to_f
85
85
  end
@@ -56,7 +56,7 @@ module PageRank
56
56
  w / @weight_totals[source]
57
57
  end
58
58
  end
59
- Hash[@nodes.map { |k| [k, 1.0 / node_count.to_f] }]
59
+ @nodes.to_h { |k| [k, 1.0 / node_count.to_f] }
60
60
  end
61
61
 
62
62
  def calculate_step(ranks)
@@ -68,14 +68,14 @@ module PageRank
68
68
  @dangling_nodes.each do |source|
69
69
  sum += ranks[source] / node_count.to_f
70
70
  end
71
- new_ranks[dest] = @damping * sum + (1 - @damping) / node_count
71
+ new_ranks[dest] = (damping * sum) + ((1 - damping) / node_count)
72
72
  end
73
73
  end
74
74
 
75
75
  def sort_ranks(ranks)
76
76
  sum = 0.0
77
77
  ranks.each { |_, v| sum += v }
78
- Hash[ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }]
78
+ ranks.map { |k, v| [k, v / sum] }.sort_by { |_, v| -v }.to_h
79
79
  end
80
80
 
81
81
  def distance(vector1, vector2)
@@ -0,0 +1,21 @@
1
module PageRank
  ##
  # PageRank strategy whose graph storage and rank calculation are performed
  # by the native extension. The private methods +_add_edge+ and +_calculate+
  # used below are defined on this class by the extension's initializer.
  ##
  class SparseNative < Base

    # require 'page_rank/sparse_native.so'

    # Adds a weighted edge to the graph. Self-referencing edges (where source
    # and dest are equal) are silently ignored.
    # @param (see Base#add)
    # @param weight [Float] Optional weight for the graph edge
    # @return (see Base#add)
    def add(source, dest, weight: 1.0)
      return if source == dest

      _add_edge(source, dest, weight)
    end

    # Perform the PageRank calculation
    # @param max_iterations [Fixnum] Maximum number of PageRank iterations to perform (or -1 for no max)
    # @return [Hash<Object, Float>] of nodes with rank
    def calculate(max_iterations: -1, **_)
      _calculate(max_iterations, damping, tolerance)
    end

  end
end
data/lib/page_rank.rb CHANGED
@@ -17,16 +17,17 @@ require 'set'
17
17
  ##
18
18
  module PageRank
19
19
 
20
- autoload :Base, 'page_rank/base'
21
- autoload :Dense, 'page_rank/dense'
22
- autoload :Sparse, 'page_rank/sparse'
20
+ autoload :Base, 'page_rank/base'
21
+ autoload :Dense, 'page_rank/dense'
22
+ autoload :Sparse, 'page_rank/sparse'
23
+ autoload :SparseNative, 'page_rank/sparse_native'
23
24
 
24
25
  # @option options [Symbol] :strategy PageRank strategy to use (:sparse, :dense, or :sparse_native)
25
26
  # @option options [Float] :damping The probability of following the graph vs. randomly choosing a new node
26
27
  # @option options [Float] :tolerance The desired accuracy of the results
27
28
  # @return [PageRank::Base]
28
29
  def self.new(strategy: :sparse, **options)
29
- const_get(strategy.to_s.capitalize).new(**options)
30
+ const_get(strategy.to_s.split('_').map(&:capitalize).join).new(**options)
30
31
  end
31
32
 
32
33
  # Convenience method to quickly calculate PageRank. In the calling block, graph edges can be added.
@@ -3,14 +3,12 @@ module TextRank
3
3
  ##
4
4
  # Character filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
5
5
  #
6
- # rubocop:disable Style/AsciiComments
7
6
  #
8
7
  # = Example
9
8
  #
10
9
  # AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
11
10
  # => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
12
11
  #
13
- # rubocop:enable Style/AsciiComments
14
12
  #
15
13
  ##
16
14
  class AsciiFolding
@@ -17,6 +17,7 @@ module TextRank
17
17
  class StripHtml < Nokogiri::XML::SAX::Document
18
18
 
19
19
  def initialize
20
+ super
20
21
  @text = StringIO.new
21
22
  end
22
23
 
@@ -57,7 +57,7 @@ module TextRank
57
57
  end
58
58
 
59
59
  # Calculates the "similarity" between this fingerprint and another
60
- # @param {Fingerprint} A second fingerprint to compare
60
+ # @param {Fingerprint} other A second fingerprint to compare
61
61
  # @return [Number] A number between 0.0 (different) and 1.0 (same)
62
62
  def similarity(other)
63
63
  return 1.0 if values == other.values # Short-circuit for efficiency
@@ -83,7 +83,7 @@ module TextRank
83
83
 
84
84
  def norm_factor
85
85
  @norm_factor ||= size.times.reduce(0.0) do |s, i|
86
- s + (i + 1) / Math.log(i + 2) / size.to_f
86
+ s + ((i + 1) / Math.log(i + 2) / size.to_f)
87
87
  end
88
88
  end
89
89
 
@@ -60,7 +60,7 @@ module TextRank
60
60
  # @param graph [PageRank::Base] a PageRank graph into which to add nodes/edges
61
61
  # return [nil]
62
62
  def build_graph(tokens, graph)
63
- ngram_window = @ngram_size * 2 + 1
63
+ ngram_window = (@ngram_size * 2) + 1
64
64
  tokens.size.times do |i|
65
65
  ngram_window.times do |j|
66
66
  consider_ngram_window(tokens, graph, i, j)
@@ -71,14 +71,14 @@ module TextRank
71
71
 
72
72
  private
73
73
 
74
- def consider_ngram_window(tokens, graph, i, j)
75
- return if j == @ngram_size || i + j < @ngram_size
74
+ def consider_ngram_window(tokens, graph, idx_i, idx_j)
75
+ return if idx_j == @ngram_size || idx_i + idx_j < @ngram_size
76
76
 
77
- token_i = tokens[i]
78
- token_j = tokens[i - @ngram_size + j]
77
+ token_i = tokens[idx_i]
78
+ token_j = tokens[idx_i - @ngram_size + idx_j]
79
79
 
80
80
  if token_j
81
- graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
81
+ graph.add(token_i, token_j, weight: 1.0 / (idx_j - @ngram_size).abs)
82
82
  end
83
83
  end
84
84
 
@@ -71,7 +71,6 @@ module TextRank
71
71
  end
72
72
 
73
73
  # Sets the graph strategy for producing a graph from tokens
74
- # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
75
74
  # @return [Class, Symbol, #build_graph]
76
75
  attr_writer :graph_strategy
77
76
 
@@ -103,14 +102,23 @@ module TextRank
103
102
  end
104
103
 
105
104
  # Filter & tokenize text, and return PageRank
106
- # @param text [String] unfiltered text to be processed
105
+ # @param text [String,Array<String>] unfiltered text to be processed
107
106
  # @return [Hash<String, Float>] tokens and page ranks (in descending order)
108
107
  def extract(text, **options)
109
- tokens = tokenize(text)
108
+ text = Array(text)
109
+ tokens_per_text = text.map do |t|
110
+ tokenize(t)
111
+ end
110
112
  graph = PageRank.new(**@page_rank_options)
111
- classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
113
+ strategy = classify(@graph_strategy, context: GraphStrategy)
114
+ tokens_per_text.each do |tokens|
115
+ strategy.build_graph(tokens, graph)
116
+ end
112
117
  ranks = graph.calculate(**options)
113
- apply_rank_filters(ranks, tokens: tokens, original_text: text)
118
+ tokens_per_text.each_with_index do |tokens, i|
119
+ ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
120
+ end
121
+ ranks
114
122
  end
115
123
 
116
124
  private
@@ -151,7 +151,7 @@ module TextRank
151
151
  # tokenization (e.g. ASCII folding). That's okay. We're just making the best effort we can
152
152
  # to find what we can.
153
153
  def scan_text_for_all_permutations_of(single_tokens)
154
- # NOTE that by reversing the order we craft the regex to prefer larger combinations over
154
+ # NOTE: by reversing the order, we craft the regex to prefer larger combinations over
155
155
  # smaller combinations (or singletons).
156
156
  perms = (1..@max_tokens_to_combine).to_a.reverse.flat_map do |n|
157
157
  scan_text_for_n_permutations_of(single_tokens, n)
@@ -162,8 +162,8 @@ module TextRank
162
162
  end unless perms.empty?
163
163
  end
164
164
 
165
- def scan_text_for_n_permutations_of(single_tokens, n)
166
- single_tokens.permutation(n).map do |perm|
165
+ def scan_text_for_n_permutations_of(single_tokens, n_perms)
166
+ single_tokens.permutation(n_perms).map do |perm|
167
167
  unless @permutations_scanned.key?(perm)
168
168
  @permutations_scanned[perm] = 0
169
169
  perm
@@ -14,7 +14,7 @@ module TextRank
14
14
  # @param ranks [Hash<String, Float>] the results of the PageRank algorithm
15
15
  # @return [Hash<String, Float>]
16
16
  def filter!(ranks, **_)
17
- Hash[ranks.sort_by { |_, v| @descending ? -v : v }]
17
+ ranks.sort_by { |_, v| @descending ? -v : v }.to_h
18
18
  end
19
19
 
20
20
  end
@@ -1,7 +1,7 @@
1
1
  module TextRank
2
2
  module Tokenizer
3
3
 
4
- CURRENCY_SYMBOLS = '[' + [
4
+ CURRENCY_SYMBOLS = "[#{[
5
5
  "\u00a4", # Generic Currency Symbol
6
6
  "\u0024", # Dollar Sign
7
7
  "\u00a2", # Cent Sign
@@ -26,14 +26,13 @@ module TextRank
26
26
  "\u20ab", # Dong Sign
27
27
  "\u0025", # Percent
28
28
  "\u2030", # Per Million
29
- ].join + ']'
29
+ ].join}]"
30
30
  private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion
31
31
 
32
32
  ##
33
33
  # A tokenizer regex that preserves money or formatted numbers as a single token. This
34
34
  # currently supports 24 different currency symbols:
35
35
  #
36
- # rubocop:disable Style/AsciiComments
37
36
  #
38
37
  # * ¤
39
38
  # * $
@@ -60,7 +59,6 @@ module TextRank
60
59
  # * %
61
60
  # * ‰
62
61
 
63
- # rubocop:enable Style/AsciiComments
64
62
  #
65
63
  # It also supports two alternative formats for negatives as well as optional three digit comma
66
64
  # separation and optional decimals.