text_rank 1.2.3 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
@@ -1,13 +1,17 @@
1
- # coding: utf-8
2
1
  module TextRank
3
2
  module CharFilter
4
3
  ##
5
4
  # Character filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
6
5
  #
6
+ # rubocop:disable Style/AsciiComments
7
+ #
7
8
  # = Example
8
9
  #
9
10
  # AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
10
11
  # => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
12
+ #
13
+ # rubocop:enable Style/AsciiComments
14
+ #
11
15
  ##
12
16
  class AsciiFolding
13
17
 
@@ -5,7 +5,7 @@ module TextRank
5
5
  #
6
6
  # = Example
7
7
  #
8
- # StripPosessive.new.filter!("to loathe ones very being and yet to hold it fast")
8
+ # StripPossessive.new.filter!("to loathe one's very being and yet to hold it fast")
9
9
  # => "to loathe one very being and yet to hold it fast"
10
10
  ##
11
11
  class StripPossessive
@@ -15,7 +15,7 @@ module TextRank
15
15
  # @return [String]
16
16
  def filter!(text)
17
17
  text.gsub!(/([a-z]+)'s\b/) do
18
- $1
18
+ Regexp.last_match(1)
19
19
  end
20
20
  end
21
21
 
@@ -11,143 +11,7 @@ module TextRank
11
11
  class UndoContractions
12
12
 
13
13
  # List of English contractions to undo
14
- CONTRACTIONS = {
15
- "ain't" => "am not",
16
- "amn't" => "am not",
17
- "aren't" => "are not",
18
- "can't" => "can not",
19
- "could've" => "could have",
20
- "couldn't" => "could not",
21
- "couldn't've" => "could not have",
22
- "didn't" => "did not",
23
- "doesn't" => "does not",
24
- "don't" => "do not",
25
- "gonna" => "going to",
26
- "hadn't" => "had not",
27
- "hadn't've" => "had not have",
28
- "hasn't" => "has not",
29
- "haven't" => "have not",
30
- "he'd" => "he had",
31
- "he'd've" => "he would have",
32
- "he'll" => "he shall",
33
- "he's" => "he has",
34
- "he'sn't" => "he has not",
35
- "how'd" => "how did",
36
- "how'll" => "how will",
37
- "how's" => "how has",
38
- "i'd" => "i had",
39
- "i'd've" => "i would have",
40
- "i'll" => "i shall",
41
- "i'm" => "i am",
42
- "i've" => "i have",
43
- "i'ven't" => "i have not",
44
- "isn't" => "is not",
45
- "it'd" => "it had",
46
- "it'd've" => "it would have",
47
- "it'll" => "it shall",
48
- "it's" => "it has",
49
- "it'sn't" => "it has not",
50
- "let's" => "let us",
51
- "ma'am" => "madam",
52
- "mightn't" => "might not",
53
- "mightn't've" => "might not have",
54
- "might've" => "might have",
55
- "mustn't" => "must not",
56
- "must've" => "must have",
57
- "needn't" => "need not",
58
- "not've" => "not have",
59
- "o'clock" => "of the clock",
60
- "ol'" => "old",
61
- "oughtn't" => "ought not",
62
- "shan't" => "shall not",
63
- "she'd" => "she had",
64
- "she'd've" => "she would have",
65
- "she'll" => "she shall",
66
- "she's" => "she has",
67
- "she'sn't" => "she has not",
68
- "should've" => "should have",
69
- "shouldn't" => "should not",
70
- "shouldn't've" => "should not have",
71
- "somebody'd" => "somebody had",
72
- "somebody'd've" => "somebody would have",
73
- "somebody'dn't've" => "somebody would not have",
74
- "somebody'll" => "somebody shall",
75
- "somebody's" => "somebody has",
76
- "someone'd" => "someone had",
77
- "someone'd've" => "someone would have",
78
- "someone'll" => "someone shall",
79
- "someone's" => "someone has",
80
- "something'd" => "something had",
81
- "something'd've" => "something would have",
82
- "something'll" => "something shall",
83
- "something's" => "something has",
84
- "'sup" => "what's up",
85
- "that'll" => "that will",
86
- "that's" => "that has",
87
- "there'd" => "there had",
88
- "there'd've" => "there would have",
89
- "there're" => "there are",
90
- "there's" => "there has",
91
- "they'd" => "they had",
92
- "they'dn't" => "they would not",
93
- "they'dn't've" => "they would not have",
94
- "they'd've" => "they would have",
95
- "they'd'ven't" => "they would have not",
96
- "they'll" => "they shall",
97
- "they'lln't've" => "they will not have",
98
- "they'll'ven't" => "they will have not",
99
- "they're" => "they are",
100
- "they've" => "they have",
101
- "they'ven't" => "they have not",
102
- "'tis" => "it is",
103
- "'twas" => "it was",
104
- "wanna" => "want to",
105
- "wasn't" => "was not",
106
- "we'd" => "we had",
107
- "we'd've" => "we would have",
108
- "we'dn't've" => "we would not have",
109
- "we'll" => "we will",
110
- "we'lln't've" => "we will not have",
111
- "we're" => "we are",
112
- "we've" => "we have",
113
- "weren't" => "were not",
114
- "what'll" => "what shall",
115
- "what're" => "what are",
116
- "what's" => "what has",
117
- "what've" => "what have",
118
- "when's" => "when has",
119
- "where'd" => "where did",
120
- "where's" => "where has",
121
- "where've" => "where have",
122
- "who'd" => "who would",
123
- "who'd've" => "who would have",
124
- "who'll" => "who shall",
125
- "who're" => "who are",
126
- "who's" => "who has",
127
- "who've" => "who have",
128
- "why'll" => "why will",
129
- "why're" => "why are",
130
- "why's" => "why has",
131
- "won't" => "will not",
132
- "won't've" => "will not have",
133
- "would've" => "would have",
134
- "wouldn't" => "would not",
135
- "wouldn't've" => "would not have",
136
- "y'all" => "you all",
137
- "y'all'd've" => "you all would have",
138
- "y'all'dn't've" => "you all would not have",
139
- "y'all'll" => "you all will",
140
- "y'all'lln't" => "you all will not",
141
- "y'all'll've" => "you all will have",
142
- "y'all'll'ven't" => "you all will have not",
143
- "you'd" => "you had",
144
- "you'd've" => "you would have",
145
- "you'll" => "you shall",
146
- "you're" => "you are",
147
- "you'ren't" => "you are not",
148
- "you've" => "you have",
149
- "you'ven't" => "you have not",
150
- }
14
+ CONTRACTIONS = YAML.load_file(File.expand_path('undo_contractions.yml', __dir__))
151
15
 
152
16
  # Perform the filter
153
17
  # @param text [String]
@@ -0,0 +1,135 @@
1
+ ain't: am not
2
+ amn't: am not
3
+ aren't: are not
4
+ can't: can not
5
+ could've: could have
6
+ couldn't: could not
7
+ couldn't've: could not have
8
+ didn't: did not
9
+ doesn't: does not
10
+ don't: do not
11
+ gonna: going to
12
+ hadn't: had not
13
+ hadn't've: had not have
14
+ hasn't: has not
15
+ haven't: have not
16
+ he'd: he had
17
+ he'd've: he would have
18
+ he'll: he shall
19
+ he's: he has
20
+ he'sn't: he has not
21
+ how'd: how did
22
+ how'll: how will
23
+ how's: how has
24
+ i'd: i had
25
+ i'd've: i would have
26
+ i'll: i shall
27
+ i'm: i am
28
+ i've: i have
29
+ i'ven't: i have not
30
+ isn't: is not
31
+ it'd: it had
32
+ it'd've: it would have
33
+ it'll: it shall
34
+ it's: it has
35
+ it'sn't: it has not
36
+ let's: let us
37
+ ma'am: madam
38
+ mightn't: might not
39
+ mightn't've: might not have
40
+ might've: might have
41
+ mustn't: must not
42
+ must've: must have
43
+ needn't: need not
44
+ not've: not have
45
+ o'clock: of the clock
46
+ ol': old
47
+ oughtn't: ought not
48
+ shan't: shall not
49
+ she'd: she had
50
+ she'd've: she would have
51
+ she'll: she shall
52
+ she's: she has
53
+ she'sn't: she has not
54
+ should've: should have
55
+ shouldn't: should not
56
+ shouldn't've: should not have
57
+ somebody'd: somebody had
58
+ somebody'd've: somebody would have
59
+ somebody'dn't've: somebody would not have
60
+ somebody'll: somebody shall
61
+ somebody's: somebody has
62
+ someone'd: someone had
63
+ someone'd've: someone would have
64
+ someone'll: someone shall
65
+ someone's: someone has
66
+ something'd: something had
67
+ something'd've: something would have
68
+ something'll: something shall
69
+ something's: something has
70
+ "'sup": "what's up"
71
+ that'll: that will
72
+ that's: that has
73
+ there'd: there had
74
+ there'd've: there would have
75
+ there're: there are
76
+ there's: there has
77
+ they'd: they had
78
+ they'dn't: they would not
79
+ they'dn't've: they would not have
80
+ they'd've: they would have
81
+ they'd'ven't: they would have not
82
+ they'll: they shall
83
+ they'lln't've: they will not have
84
+ they'll'ven't: they will have not
85
+ they're: they are
86
+ they've: they have
87
+ they'ven't: they have not
88
+ "'tis": it is
89
+ "'twas": it was
90
+ wanna: want to
91
+ wasn't: was not
92
+ we'd: we had
93
+ we'd've: we would have
94
+ we'dn't've: we would not have
95
+ we'll: we will
96
+ we'lln't've: we will not have
97
+ we're: we are
98
+ we've: we have
99
+ weren't: were not
100
+ what'll: what shall
101
+ what're: what are
102
+ what's: what has
103
+ what've: what have
104
+ when's: when has
105
+ where'd: where did
106
+ where's: where has
107
+ where've: where have
108
+ who'd: who would
109
+ who'd've: who would have
110
+ who'll: who shall
111
+ who're: who are
112
+ who's: who has
113
+ who've: who have
114
+ why'll: why will
115
+ why're: why are
116
+ why's: why has
117
+ won't: will not
118
+ won't've: will not have
119
+ would've: would have
120
+ wouldn't: would not
121
+ wouldn't've: would not have
122
+ y'all: you all
123
+ y'all'd've: you all would have
124
+ y'all'dn't've: you all would not have
125
+ y'all'll: you all will
126
+ y'all'lln't: you all will not
127
+ y'all'll've: you all will have
128
+ y'all'll'ven't: you all will have not
129
+ you'd: you had
130
+ you'd've: you would have
131
+ you'll: you shall
132
+ you're: you are
133
+ you'ren't: you are not
134
+ you've: you have
135
+ you'ven't: you have not
@@ -7,7 +7,7 @@ module TextRank
7
7
  # converting non-ascii characters to related ascii characters, forcing text to
8
8
  # lower case, stripping out HTML, converting English contractions (e.g. "won't")
9
9
  # to the non-contracted form ("will not"), and more.
10
- #
10
+ #
11
11
  # Character filters are applied as a chain, so care should be taken to use them
12
12
  # in the desired order.
13
13
  ##
@@ -1,5 +1,3 @@
1
- require 'set'
2
-
3
1
  module TextRank
4
2
  ##
5
3
  # Class used to compare documents according to TextRank. A "fingerprint"
@@ -61,28 +59,22 @@ module TextRank
61
59
  # Calculates the "similarity" between this fingerprint and another
62
60
  # @param {Fingerprint} A second fingerprint to compare
63
61
  # @return [Number] A number between 0.0 (different) and 1.0 (same)
64
- def similarity(trf2)
65
- return 1.0 if values == trf2.values
66
-
67
- sim = 0
68
- s1 = Set.new
69
- s2 = Set.new
62
+ def similarity(other)
63
+ return 1.0 if values == other.values # Short-circuit for efficiency
70
64
 
71
- [size, trf2.size].max.times.reduce(0) do |sum, i|
72
- v1 = values[i]
73
- v2 = trf2.values[i]
74
- if v1 == v2
75
- sim += 1
76
- else
77
- s1.delete?(v2) ? (sim += 1) : (s2 << v2)
78
- s2.delete?(v1) ? (sim += 1) : (s1 << v1)
79
- end
80
- sum + sim * linear_transform[i]
65
+ sum = 0
66
+ overlap(other).each_with_index do |overlap_value, i|
67
+ sum += overlap_value * linear_transform[i]
81
68
  end
69
+ sum
82
70
  end
83
71
 
84
72
  private
85
73
 
74
+ def overlap(other)
75
+ FingerprintOverlap.new(values, other.values).overlap
76
+ end
77
+
86
78
  def linear_transform
87
79
  @linear_transform ||= size.times.map do |i|
88
80
  1.0 / Math.log(i + 2) / size.to_f / norm_factor
@@ -0,0 +1,55 @@
1
+ module TextRank
2
+ ##
3
+ # Determines the "overlap" between two fingerprints at each prefix length N
4
+ #
5
+ # For example,
6
+ #
7
+ # FingerprintOverlap.new(
8
+ # %w[a b c d],
9
+ # %w[b e a c],
10
+ # ).overlap
11
+ #
12
+ # => [
13
+ # 0, # [a] & [b] have no overlap
14
+ # 1, # [a b] & [b e] have one overlap: b
15
+ # 2, # [a b c] & [b e a] have two overlap: a & b
16
+ # 3, # [a b c d] & [b e a c] have three overlap: a, b, & c
17
+ # ]
18
+ ##
19
+ class FingerprintOverlap
20
+
21
+ attr_reader :overlap
22
+
23
+ def initialize(values1, values2)
24
+ raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
25
+
26
+ @encountered1 = Set.new
27
+ @encountered2 = Set.new
28
+ @overlap_count = 0
29
+
30
+ @overlap = determine_overlap(values1, values2)
31
+ end
32
+
33
+ private
34
+
35
+ def determine_overlap(values1, values2)
36
+ values1.zip(values2).map do |v1, v2|
37
+ encounter(v1, v2)
38
+ @overlap_count
39
+ end
40
+ end
41
+
42
+ # This algorithm is a little more complex than an idiomatic Ruby version would be,
43
+ # but we want to keep it as performant as possible.
44
+ def encounter(value1, value2)
45
+ if value1 == value2
46
+ @overlap_count += 1
47
+ else
48
+ # Delete from the set in case an element appears more than once
49
+ @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
50
+ @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
51
+ end
52
+ end
53
+
54
+ end
55
+ end
@@ -61,18 +61,27 @@ module TextRank
61
61
  # @return [nil]
62
62
  def build_graph(tokens, graph)
63
63
  ngram_window = @ngram_size * 2 + 1
64
- tokens.each_with_index do |token_i, i|
64
+ tokens.size.times do |i|
65
65
  ngram_window.times do |j|
66
- next if j == @ngram_size || i + j < @ngram_size
67
- token_j = tokens[i - @ngram_size + j]
68
- if token_j
69
- graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
70
- end
66
+ consider_ngram_window(tokens, graph, i, j)
71
67
  end
72
68
  end
73
69
  nil
74
70
  end
75
71
 
72
+ private
73
+
74
+ def consider_ngram_window(tokens, graph, i, j)
75
+ return if j == @ngram_size || i + j < @ngram_size
76
+
77
+ token_i = tokens[i]
78
+ token_j = tokens[i - @ngram_size + j]
79
+
80
+ if token_j
81
+ graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
82
+ end
83
+ end
84
+
76
85
  end
77
86
  end
78
87
  end
@@ -13,9 +13,9 @@ module TextRank
13
13
  # @return [KeywordExtractor]
14
14
  def self.basic(**options)
15
15
  new(**{
16
- char_filters: [:AsciiFolding, :Lowercase],
17
- tokenizers: [:Word],
18
- token_filters: [:Stopwords, :MinLength],
16
+ char_filters: %i[AsciiFolding Lowercase],
17
+ tokenizers: %i[Word],
18
+ token_filters: %i[Stopwords MinLength],
19
19
  graph_strategy: :Coocurrence,
20
20
  }.merge(options))
21
21
  end
@@ -25,11 +25,11 @@ module TextRank
25
25
  # @return [KeywordExtractor]
26
26
  def self.advanced(**options)
27
27
  new(**{
28
- char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
29
- tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
30
- token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
28
+ char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
29
+ tokenizers: %i[Url Money Number Word Punctuation],
30
+ token_filters: %i[PartOfSpeech Stopwords MinLength],
31
31
  graph_strategy: :Coocurrence,
32
- rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
32
+ rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
33
33
  }.merge(options))
34
34
  end
35
35
 
@@ -41,14 +41,14 @@ module TextRank
41
41
  # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
42
42
  def initialize(**options)
43
43
  @page_rank_options = {
44
- strategy: options[:strategy] || :sparse,
45
- damping: options[:damping],
44
+ strategy: options[:strategy] || :sparse,
45
+ damping: options[:damping],
46
46
  tolerance: options[:tolerance],
47
47
  }
48
- @char_filters = options[:char_filters] || []
49
- @tokenizers = options[:tokenizers] || [Tokenizer::Word]
50
- @token_filters = options[:token_filters] || []
51
- @rank_filters = options[:rank_filters] || []
48
+ @char_filters = options[:char_filters] || []
49
+ @tokenizers = options[:tokenizers] || [Tokenizer::Word]
50
+ @token_filters = options[:token_filters] || []
51
+ @rank_filters = options[:rank_filters] || []
52
52
  @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
53
53
  end
54
54
 
@@ -73,9 +73,7 @@ module TextRank
73
73
  # Sets the graph strategy for producing a graph from tokens
74
74
  # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
75
75
  # @return [Class, Symbol, #build_graph]
76
- def graph_strategy=(strategy)
77
- @graph_strategy = strategy
78
- end
76
+ attr_writer :graph_strategy
79
77
 
80
78
  # Add a new TokenFilter for processing tokens after tokenization
81
79
  # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
105
103
  end
106
104
 
107
105
  # Filter & tokenize text, and return PageRank
108
- # @param text [String] unfiltered text to be processed
106
+ # @param text [String,Array<String>] unfiltered text to be processed
109
107
  # @return [Hash<String, Float>] tokens and page ranks (in descending order)
110
108
  def extract(text, **options)
111
- tokens = tokenize(text)
109
+ text = Array(text)
110
+ tokens_per_text = text.map do |t|
111
+ tokenize(t)
112
+ end
112
113
  graph = PageRank.new(**@page_rank_options)
113
- classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
114
+ strategy = classify(@graph_strategy, context: GraphStrategy)
115
+ tokens_per_text.each do |tokens|
116
+ strategy.build_graph(tokens, graph)
117
+ end
114
118
  ranks = graph.calculate(**options)
115
- apply_rank_filters(ranks, tokens: tokens, original_text: text)
119
+ tokens_per_text.each_with_index do |tokens, i|
120
+ ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
121
+ end
122
+ ranks
116
123
  end
117
124
 
118
125
  private
@@ -153,14 +160,14 @@ module TextRank
153
160
  array.insert(idx, value)
154
161
  end
155
162
 
156
- def classify(c, context: self)
157
- case c
163
+ def classify(clazz, context: self)
164
+ case clazz
158
165
  when Class
159
- c.new
166
+ clazz.new
160
167
  when Symbol
161
- context.const_get(c).new
168
+ context.const_get(clazz).new
162
169
  else
163
- c
170
+ clazz
164
171
  end
165
172
  end
166
173