text_rank 1.2.0 → 1.2.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +5 -5
  2. data/.codeclimate.yml +1 -6
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +60 -1075
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +14 -5
  7. data/{LICENSE.txt → LICENSE} +0 -0
  8. data/README.md +2 -1
  9. data/Rakefile +5 -0
  10. data/bin/console +3 -3
  11. data/ext/text_rank/extconf.rb +3 -0
  12. data/ext/text_rank/page_rank_sparse_native.c +296 -0
  13. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  14. data/ext/text_rank/text_rank.c +5 -0
  15. data/lib/page_rank.rb +7 -4
  16. data/lib/page_rank/base.rb +12 -9
  17. data/lib/page_rank/dense.rb +3 -2
  18. data/lib/page_rank/sparse.rb +6 -7
  19. data/lib/page_rank/sparse_native.rb +21 -0
  20. data/lib/text_rank.rb +14 -9
  21. data/lib/text_rank/char_filter.rb +1 -1
  22. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  23. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  24. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  25. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  26. data/lib/text_rank/fingerprint.rb +10 -18
  27. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  28. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  29. data/lib/text_rank/keyword_extractor.rb +32 -25
  30. data/lib/text_rank/rank_filter/collapse_adjacent.rb +53 -26
  31. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  32. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  33. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  34. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  35. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  36. data/lib/text_rank/tokenizer.rb +1 -1
  37. data/lib/text_rank/tokenizer/money.rb +11 -6
  38. data/lib/text_rank/tokenizer/number.rb +4 -3
  39. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  40. data/lib/text_rank/tokenizer/url.rb +3 -0
  41. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  42. data/lib/text_rank/tokenizer/word.rb +5 -2
  43. data/lib/text_rank/version.rb +3 -1
  44. data/text_rank.gemspec +12 -10
  45. metadata +69 -33
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- ruby-2.1.2
1
+ ruby-2.5.1
data/.travis.yml CHANGED
@@ -1,7 +1,16 @@
1
+ env:
2
+ global:
3
+ - CC_TEST_REPORTER_ID=6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
1
4
  language: ruby
2
5
  rvm:
3
- - 2.1.2
4
- before_install: gem install bundler -v 1.11.2
5
- addons:
6
- code_climate:
7
- repo_token: 6ab030bf370ffc2abbf0ba4d70a1c8d9649f6fd1426f48f6d43d5c9eb15f187f
6
+ - 2.5.1
7
+ before_install: gem install bundler -v 1.17.3
8
+ before_script:
9
+ - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
10
+ - chmod +x ./cc-test-reporter
11
+ - ./cc-test-reporter before-build
12
+ - bundle exec rake compile
13
+ script:
14
+ - bundle exec rspec
15
+ after_script:
16
+ - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
File without changes
data/README.md CHANGED
@@ -7,6 +7,7 @@
7
7
 
8
8
  ## Status
9
9
 
10
+ [![Gem Version](https://badge.fury.io/rb/text_rank.svg)](https://badge.fury.io/rb/text_rank)
10
11
  [![Travis Build Status](https://travis-ci.org/david-mccullars/text_rank.svg?branch=master)](https://travis-ci.org/david-mccullars/text_rank)
11
12
  [![Code Climate](https://codeclimate.com/github/david-mccullars/text_rank/badges/gpa.svg)](https://codeclimate.com/github/david-mccullars/text_rank)
12
13
  [![Test Coverage](https://codeclimate.com/github/david-mccullars/text_rank/badges/coverage.svg)](https://codeclimate.com/github/david-mccullars/text_rank/coverage)
@@ -127,7 +128,7 @@ multiplication. Each iteration is O(N^3) where N is the number of graph nodes.
127
128
 
128
129
  ## License
129
130
 
130
- MIT. See the `LICENSE.txt` file.
131
+ MIT. See the `LICENSE` file.
131
132
 
132
133
 
133
134
  ## References
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rake/extensiontask"
2
3
  require "rspec/core/rake_task"
3
4
 
4
5
  RSpec::Core::RakeTask.new(:spec)
@@ -10,3 +11,7 @@ RDoc::Task.new do |rdoc|
10
11
  rdoc.main = "README.md"
11
12
  rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
12
13
  end
14
+
15
+ Rake::ExtensionTask.new('text_rank') do |ext|
16
+ ext.lib_dir = 'lib/text_rank'
17
+ end
data/bin/console CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "text_rank"
3
+ require 'bundler/setup'
4
+ require 'text_rank'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "text_rank"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
13
+ require 'irb'
14
14
  IRB.start
@@ -0,0 +1,3 @@
1
+ require "mkmf"
2
+
3
+ create_makefile('text_rank/text_rank')
@@ -0,0 +1,296 @@
1
+ #include <ruby.h>
2
+ #include <math.h>
3
+ #include <page_rank_sparse_native.h>
4
+
5
+ const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
6
+ const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
7
+ const size_t NODE_SIZE = sizeof(NodeStruct);
8
+ const size_t EDGE_SIZE = sizeof(EdgeStruct);
9
+ const size_t GRAPH_SIZE = sizeof(GraphStruct);
10
+
11
+ static const rb_data_type_t graph_typed_data = {
12
+ "PageRank/SparseNative/Graph",
13
+ { 0, free_graph, },
14
+ 0, 0,
15
+ RUBY_TYPED_FREE_IMMEDIATELY,
16
+ };
17
+
18
+
19
+ //////////////////////////////////////////////////////////////////////////////////////
20
+
21
+ void Init_sparse_native() {
22
+ VALUE PageRankModule, SparseNativeClass;
23
+
24
+ PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
25
+ SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
26
+
27
+ rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
28
+ rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
29
+ rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
30
+ }
31
+
32
+ VALUE sparse_native_allocate(VALUE self) {
33
+ Graph g = malloc(GRAPH_SIZE);
34
+ //st_table *tmp, *node_lookup;
35
+
36
+ const struct st_hash_type *objhash = rb_hash_tbl(rb_hash_new())->type;
37
+
38
+ g->node_count = 0;
39
+ g->nodes = NULL;
40
+ g->dangling_nodes = NULL;
41
+ g->node_lookup = st_init_table_with_size(objhash, 0);
42
+
43
+ return TypedData_Wrap_Struct(self, &graph_typed_data, g);
44
+ }
45
+
46
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
47
+ Graph g;
48
+
49
+ TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
50
+ add_edge_with_labels(g, source, dest, NUM2DBL(weight));
51
+ return Qnil;
52
+ }
53
+
54
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
55
+ Graph g;
56
+ VALUE ranks;
57
+
58
+ TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
59
+ calculate(g, FIX2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));
60
+
61
+ ranks = rb_hash_new();
62
+ sort_and_normalize_ranks(g, rb_hash_dset, ranks);
63
+ return ranks;
64
+ }
65
+
66
+ void rb_hash_dset(VALUE hash, VALUE key, double value) {
67
+ rb_hash_aset(hash, key, DBL2NUM(value));
68
+ }
69
+
70
+ //////////////////////////////////////////////////////////////////////////////////////
71
+
72
+ void free_graph(void *data) {
73
+ Graph g = (Graph)data;
74
+ free_node_list(g->nodes, free_node);
75
+ free_node_list(g->dangling_nodes, NULL);
76
+ free(g->node_lookup);
77
+ free(g);
78
+ }
79
+
80
+ void free_node(Node n) {
81
+ free_edge_list(n->source_edges, free_edge);
82
+ free(n);
83
+ }
84
+
85
+ void free_node_list(NodeList nodes, void (*free_item)(Node)) {
86
+ NodeList tmp;
87
+ while (nodes != NULL) {
88
+ tmp = nodes;
89
+ nodes = nodes->next;
90
+ if (free_item) {
91
+ free_item(tmp->node);
92
+ }
93
+ free(tmp);
94
+ }
95
+ }
96
+
97
+ void free_edge(Edge e) {
98
+ // Assume source node was allocated elsewhere and will be free'd elsewhere
99
+ free(e);
100
+ }
101
+
102
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
103
+ EdgeList tmp;
104
+ while (edges != NULL) {
105
+ tmp = edges;
106
+ edges = edges->next;
107
+ if (free_item) {
108
+ free_item(tmp->edge);
109
+ }
110
+ free(tmp);
111
+ }
112
+ }
113
+
114
+ //////////////////////////////////////////////////////////////////////////////////////
115
+
116
+ Node add_node(Graph g, VALUE label) {
117
+ NodeList tmp = malloc(NODE_LIST_SIZE);
118
+
119
+ tmp->node = malloc(NODE_SIZE);
120
+ tmp->node->label = label;
121
+ tmp->node->source_edges = NULL;
122
+ tmp->node->rank = 0.0;
123
+ tmp->node->prev_rank = 0.0;
124
+ tmp->node->outbound_weight_total = 0.0;
125
+
126
+ tmp->next = g->nodes;
127
+ g->nodes = tmp;
128
+ g->node_count += 1;
129
+
130
+ return tmp->node;
131
+ }
132
+
133
+ Node add_dangling_node(Graph g, Node n) {
134
+ NodeList tmp = malloc(NODE_LIST_SIZE);
135
+
136
+ tmp->node = n;
137
+ tmp->next = g->dangling_nodes;
138
+ g->dangling_nodes = tmp;
139
+
140
+ return n;
141
+ }
142
+
143
+ Edge add_edge(Node source, Node destination, double weight) {
144
+ EdgeList tmp = malloc(EDGE_LIST_SIZE);
145
+
146
+ tmp->edge = malloc(EDGE_SIZE);
147
+ tmp->edge->source = source;
148
+ tmp->edge->weight = weight;
149
+
150
+ tmp->next = destination->source_edges;
151
+ destination->source_edges = tmp;
152
+ source->outbound_weight_total += weight;
153
+
154
+ return tmp->edge;
155
+ }
156
+
157
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
158
+ Node source, dest;
159
+
160
+ source = lookup_node(g, source_label);
161
+ dest = lookup_node(g, dest_label);
162
+
163
+ return add_edge(source, dest, weight);
164
+ }
165
+
166
+ Node lookup_node(Graph g, VALUE label) {
167
+ Node n;
168
+
169
+ if (!st_lookup(g->node_lookup, (st_data_t)label, (st_data_t *)&n)) {
170
+ n = add_node(g, label);
171
+ st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
172
+ }
173
+ return n;
174
+ }
175
+
176
+ //////////////////////////////////////////////////////////////////////////////////////
177
+
178
+ void calculate_start(Graph g) {
179
+ NodeList nodes;
180
+ Node source, destination;
181
+ EdgeList edges;
182
+ Edge e;
183
+
184
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
185
+ destination = nodes->node;
186
+
187
+ // If there is no outband, this is a "dangling" node
188
+ if (destination->outbound_weight_total == 0.0) {
189
+ add_dangling_node(g, destination);
190
+ }
191
+
192
+ // Normalize all source edge weights
193
+ for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
194
+ e = edges->edge;
195
+ source = e->source;
196
+ e->weight = e->weight / source->outbound_weight_total;
197
+ }
198
+
199
+ // Set the initial rank
200
+ destination->prev_rank = 0;
201
+ destination->rank = 1.0 / g->node_count;
202
+ }
203
+ }
204
+
205
+ void calculate_step(Graph g, double damping) {
206
+ NodeList nodes, dangling_nodes;
207
+ Node source, destination;
208
+ EdgeList edges;
209
+ Edge e;
210
+ double sum;
211
+
212
+ // Set prev rank to rank for all nodes
213
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
214
+ destination = nodes->node;
215
+ destination->prev_rank = destination->rank;
216
+ }
217
+
218
+ // Re-destribute the rankings according to weight
219
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
220
+ destination = nodes->node;
221
+ sum = 0.0;
222
+ for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
223
+ e = edges->edge;
224
+ source = e->source;
225
+ sum += source->prev_rank * e->weight;
226
+ }
227
+ for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
228
+ source = dangling_nodes->node;
229
+ sum += source->prev_rank / g->node_count;
230
+ }
231
+ destination->rank = damping * sum + (1 - damping) / g->node_count;
232
+ }
233
+ }
234
+
235
+ // Calculate the Euclidean distance from prev_rank to rank across all nodes
236
+ double prev_distance(Graph g) {
237
+ NodeList nodes;
238
+ Node n;
239
+ double rank_diff, sum_squares = 0.0;
240
+
241
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
242
+ n = nodes->node;
243
+ rank_diff = n->prev_rank - n->rank;
244
+ sum_squares += rank_diff * rank_diff;
245
+ }
246
+
247
+ return sqrt(sum_squares);
248
+ }
249
+
250
+ void calculate(Graph g, int max_iterations, double damping, double tolerance) {
251
+ calculate_start(g);
252
+
253
+ while (max_iterations != 0) { // If negative one, allow to go without limit
254
+ calculate_step(g, damping);
255
+ if (prev_distance(g) < tolerance) {
256
+ break;
257
+ }
258
+ max_iterations--;
259
+ }
260
+ }
261
+
262
+ int node_compare(const void *v1, const void *v2) {
263
+ double rank1, rank2, cmp;
264
+
265
+ rank1 = (*(Node *)v1)->rank;
266
+ rank2 = (*(Node *)v2)->rank;
267
+ cmp = rank2 - rank1; // Decreasing order
268
+ if (cmp < 0) return -1;
269
+ if (cmp > 0) return 1;
270
+ return 0;
271
+ }
272
+
273
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
274
+ NodeList nodes;
275
+ Node n;
276
+ double sum = 0.0;
277
+ unsigned long i;
278
+ Node *tmp;
279
+
280
+ i = g->node_count;
281
+ tmp = malloc(g->node_count * sizeof(Node));
282
+ for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
283
+ n = nodes->node;
284
+ tmp[--i] = n;
285
+ sum += n->rank;
286
+ }
287
+
288
+ qsort(tmp, g->node_count, sizeof(Node), node_compare);
289
+
290
+ for (i = 0; i < g->node_count; i++) {
291
+ n = tmp[i];
292
+ callback(callback_arg, n->label, n->rank / sum);
293
+ }
294
+
295
+ free(tmp);
296
+ }
@@ -0,0 +1,93 @@
1
+ #ifndef PAGE_RANK_SPARSE_NATIVE_H
2
+ #define PAGE_RANK_SPARSE_NATIVE_H
3
+
4
+ #include <ruby.h>
5
+
6
// Cell of a singly linked list of nodes; a NodeList is a pointer to the
// head cell (NULL for an empty list).
typedef struct NodeListStruct *NodeList;

typedef struct NodeListStruct {
  struct NodeStruct *node;      // node referenced by this cell
  struct NodeListStruct *next;  // following cell, or NULL at the end
} NodeListStruct;
13
+
14
+ //////////////////////////////////////////////////////////////////////////////////////
15
+
16
// Cell of a singly linked list of edges; an EdgeList is a pointer to the
// head cell (NULL for an empty list).
typedef struct EdgeListStruct *EdgeList;

typedef struct EdgeListStruct {
  struct EdgeStruct *edge;      // edge referenced by this cell
  struct EdgeListStruct *next;  // following cell, or NULL at the end
} EdgeListStruct;
23
+
24
+ //////////////////////////////////////////////////////////////////////////////////////
25
+
26
+ struct NodeStruct;
27
+ typedef struct NodeStruct* Node;
28
+
29
+ typedef struct NodeStruct {
30
+ EdgeList source_edges;
31
+ VALUE label;
32
+ double prev_rank;
33
+ double rank;
34
+ double outbound_weight_total;
35
+ } NodeStruct;
36
+
37
+ //////////////////////////////////////////////////////////////////////////////////////
38
+
39
+ struct EdgeStruct;
40
+ typedef struct EdgeStruct* Edge;
41
+
42
+ typedef struct EdgeStruct {
43
+ Node source;
44
+ double weight;
45
+ } EdgeStruct;
46
+
47
+ //////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ struct GraphStruct;
50
+ typedef struct GraphStruct* Graph;
51
+
52
+ typedef struct GraphStruct {
53
+ unsigned long node_count;
54
+ NodeList nodes;
55
+ NodeList dangling_nodes;
56
+ st_table *node_lookup;
57
+ } GraphStruct;
58
+
59
+ //////////////////////////////////////////////////////////////////////////////////////
60
+
61
+ void free_graph(void *data);
62
+ void free_node(Node n);
63
+ void free_node_list(NodeList nodes, void (*free_item)(Node));
64
+ void free_edge(Edge e);
65
+ void free_edge_list(EdgeList edges, void (*free_item)(Edge));
66
+
67
+ //////////////////////////////////////////////////////////////////////////////////////
68
+
69
+ Node add_node(Graph g, VALUE label);
70
+ Node add_dangling_node(Graph g, Node n);
71
+ Edge add_edge(Node source, Node destination, double weight);
72
+ Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight);
73
+ Node lookup_node(Graph g, VALUE label);
74
+
75
+ //////////////////////////////////////////////////////////////////////////////////////
76
+
77
+ void calculate_start(Graph g);
78
+ void calculate_step(Graph g, double damping);
79
+ double prev_distance(Graph g);
80
+ void calculate(Graph g, int max_iterations, double damping, double tolerance);
81
+ int node_compare(const void *v1, const void *v2);
82
+ void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg);
83
+
84
+ //////////////////////////////////////////////////////////////////////////////////////
85
+
86
+ void Init_sparse_native();
87
+ VALUE sparse_native_allocate(VALUE self);
88
+ VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight);
89
+ VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance);
90
+ VALUE sorted_and_normalized_ranks(Graph g);
91
+ void rb_hash_dset(VALUE hash, VALUE key, double value);
92
+
93
+ #endif