text_rank 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.codeclimate.yml +1 -1
  3. data/.gitignore +4 -0
  4. data/.rubocop.yml +7 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +1 -0
  7. data/Rakefile +5 -0
  8. data/bin/console +3 -3
  9. data/ext/text_rank/extconf.rb +3 -0
  10. data/ext/text_rank/page_rank_sparse_native.c +300 -0
  11. data/ext/text_rank/page_rank_sparse_native.h +93 -0
  12. data/ext/text_rank/text_rank.c +5 -0
  13. data/lib/page_rank/base.rb +12 -9
  14. data/lib/page_rank/dense.rb +3 -2
  15. data/lib/page_rank/sparse.rb +6 -7
  16. data/lib/page_rank/sparse_native.rb +21 -0
  17. data/lib/page_rank.rb +7 -4
  18. data/lib/text_rank/char_filter/ascii_folding.rb +5 -1
  19. data/lib/text_rank/char_filter/strip_possessive.rb +2 -2
  20. data/lib/text_rank/char_filter/undo_contractions.rb +1 -137
  21. data/lib/text_rank/char_filter/undo_contractions.yml +135 -0
  22. data/lib/text_rank/char_filter.rb +1 -1
  23. data/lib/text_rank/fingerprint.rb +10 -18
  24. data/lib/text_rank/fingerprint_overlap.rb +55 -0
  25. data/lib/text_rank/graph_strategy/coocurrence.rb +15 -6
  26. data/lib/text_rank/keyword_extractor.rb +32 -25
  27. data/lib/text_rank/rank_filter/collapse_adjacent.rb +48 -25
  28. data/lib/text_rank/rank_filter/normalize_probability.rb +2 -1
  29. data/lib/text_rank/rank_filter/normalize_unit_vector.rb +2 -1
  30. data/lib/text_rank/token_filter/part_of_speech.rb +0 -1
  31. data/lib/text_rank/token_filter/stopwords.rb +1 -321
  32. data/lib/text_rank/token_filter/stopwords.yml +317 -0
  33. data/lib/text_rank/tokenizer/money.rb +11 -6
  34. data/lib/text_rank/tokenizer/number.rb +4 -3
  35. data/lib/text_rank/tokenizer/punctuation.rb +4 -1
  36. data/lib/text_rank/tokenizer/url.rb +3 -0
  37. data/lib/text_rank/tokenizer/whitespace.rb +4 -1
  38. data/lib/text_rank/tokenizer/word.rb +5 -2
  39. data/lib/text_rank/tokenizer.rb +1 -1
  40. data/lib/text_rank/version.rb +3 -1
  41. data/lib/text_rank.rb +14 -9
  42. data/text_rank.gemspec +4 -1
  43. metadata +48 -12
@@ -1,13 +1,17 @@
1
- # coding: utf-8
2
1
  module TextRank
3
2
  module CharFilter
4
3
  ##
5
4
  # Character filter to transform non-ASCII (unicode) characters into ASCII-friendly versions.
6
5
  #
6
+ # rubocop:disable Style/AsciiComments
7
+ #
7
8
  # = Example
8
9
  #
9
10
  # AsciiFolding.new.filter!("the Perigordian Abbé then made answer, because a poor beggar of the country of Atrébatie heard some foolish things said")
10
11
  # => "the Perigordian Abbe then made answer, because a poor beggar of the country of Atrebatie heard some foolish things said"
12
+ #
13
+ # rubocop:enable Style/AsciiComments
14
+ #
11
15
  ##
12
16
  class AsciiFolding
13
17
 
@@ -5,7 +5,7 @@ module TextRank
5
5
  #
6
6
  # = Example
7
7
  #
8
- # StripPosessive.new.filter!("to loathe ones very being and yet to hold it fast")
8
+ # StripPossessive.new.filter!("to loathe one's very being and yet to hold it fast")
9
9
  # => "to loathe one very being and yet to hold it fast"
10
10
  ##
11
11
  class StripPossessive
@@ -15,7 +15,7 @@ module TextRank
15
15
  # @return [String]
16
16
  def filter!(text)
17
17
  text.gsub!(/([a-z]+)'s\b/) do
18
- $1
18
+ Regexp.last_match(1)
19
19
  end
20
20
  end
21
21
 
@@ -11,143 +11,7 @@ module TextRank
11
11
  class UndoContractions
12
12
 
13
13
  # List of English contractions to undo
14
- CONTRACTIONS = {
15
- "ain't" => "am not",
16
- "amn't" => "am not",
17
- "aren't" => "are not",
18
- "can't" => "can not",
19
- "could've" => "could have",
20
- "couldn't" => "could not",
21
- "couldn't've" => "could not have",
22
- "didn't" => "did not",
23
- "doesn't" => "does not",
24
- "don't" => "do not",
25
- "gonna" => "going to",
26
- "hadn't" => "had not",
27
- "hadn't've" => "had not have",
28
- "hasn't" => "has not",
29
- "haven't" => "have not",
30
- "he'd" => "he had",
31
- "he'd've" => "he would have",
32
- "he'll" => "he shall",
33
- "he's" => "he has",
34
- "he'sn't" => "he has not",
35
- "how'd" => "how did",
36
- "how'll" => "how will",
37
- "how's" => "how has",
38
- "i'd" => "i had",
39
- "i'd've" => "i would have",
40
- "i'll" => "i shall",
41
- "i'm" => "i am",
42
- "i've" => "i have",
43
- "i'ven't" => "i have not",
44
- "isn't" => "is not",
45
- "it'd" => "it had",
46
- "it'd've" => "it would have",
47
- "it'll" => "it shall",
48
- "it's" => "it has",
49
- "it'sn't" => "it has not",
50
- "let's" => "let us",
51
- "ma'am" => "madam",
52
- "mightn't" => "might not",
53
- "mightn't've" => "might not have",
54
- "might've" => "might have",
55
- "mustn't" => "must not",
56
- "must've" => "must have",
57
- "needn't" => "need not",
58
- "not've" => "not have",
59
- "o'clock" => "of the clock",
60
- "ol'" => "old",
61
- "oughtn't" => "ought not",
62
- "shan't" => "shall not",
63
- "she'd" => "she had",
64
- "she'd've" => "she would have",
65
- "she'll" => "she shall",
66
- "she's" => "she has",
67
- "she'sn't" => "she has not",
68
- "should've" => "should have",
69
- "shouldn't" => "should not",
70
- "shouldn't've" => "should not have",
71
- "somebody'd" => "somebody had",
72
- "somebody'd've" => "somebody would have",
73
- "somebody'dn't've" => "somebody would not have",
74
- "somebody'll" => "somebody shall",
75
- "somebody's" => "somebody has",
76
- "someone'd" => "someone had",
77
- "someone'd've" => "someone would have",
78
- "someone'll" => "someone shall",
79
- "someone's" => "someone has",
80
- "something'd" => "something had",
81
- "something'd've" => "something would have",
82
- "something'll" => "something shall",
83
- "something's" => "something has",
84
- "'sup" => "what's up",
85
- "that'll" => "that will",
86
- "that's" => "that has",
87
- "there'd" => "there had",
88
- "there'd've" => "there would have",
89
- "there're" => "there are",
90
- "there's" => "there has",
91
- "they'd" => "they had",
92
- "they'dn't" => "they would not",
93
- "they'dn't've" => "they would not have",
94
- "they'd've" => "they would have",
95
- "they'd'ven't" => "they would have not",
96
- "they'll" => "they shall",
97
- "they'lln't've" => "they will not have",
98
- "they'll'ven't" => "they will have not",
99
- "they're" => "they are",
100
- "they've" => "they have",
101
- "they'ven't" => "they have not",
102
- "'tis" => "it is",
103
- "'twas" => "it was",
104
- "wanna" => "want to",
105
- "wasn't" => "was not",
106
- "we'd" => "we had",
107
- "we'd've" => "we would have",
108
- "we'dn't've" => "we would not have",
109
- "we'll" => "we will",
110
- "we'lln't've" => "we will not have",
111
- "we're" => "we are",
112
- "we've" => "we have",
113
- "weren't" => "were not",
114
- "what'll" => "what shall",
115
- "what're" => "what are",
116
- "what's" => "what has",
117
- "what've" => "what have",
118
- "when's" => "when has",
119
- "where'd" => "where did",
120
- "where's" => "where has",
121
- "where've" => "where have",
122
- "who'd" => "who would",
123
- "who'd've" => "who would have",
124
- "who'll" => "who shall",
125
- "who're" => "who are",
126
- "who's" => "who has",
127
- "who've" => "who have",
128
- "why'll" => "why will",
129
- "why're" => "why are",
130
- "why's" => "why has",
131
- "won't" => "will not",
132
- "won't've" => "will not have",
133
- "would've" => "would have",
134
- "wouldn't" => "would not",
135
- "wouldn't've" => "would not have",
136
- "y'all" => "you all",
137
- "y'all'd've" => "you all would have",
138
- "y'all'dn't've" => "you all would not have",
139
- "y'all'll" => "you all will",
140
- "y'all'lln't" => "you all will not",
141
- "y'all'll've" => "you all will have",
142
- "y'all'll'ven't" => "you all will have not",
143
- "you'd" => "you had",
144
- "you'd've" => "you would have",
145
- "you'll" => "you shall",
146
- "you're" => "you are",
147
- "you'ren't" => "you are not",
148
- "you've" => "you have",
149
- "you'ven't" => "you have not",
150
- }
14
+ CONTRACTIONS = YAML.load_file(File.expand_path('undo_contractions.yml', __dir__))
151
15
 
152
16
  # Perform the filter
153
17
  # @param text [String]
@@ -0,0 +1,135 @@
1
+ ain't: am not
2
+ amn't: am not
3
+ aren't: are not
4
+ can't: can not
5
+ could've: could have
6
+ couldn't: could not
7
+ couldn't've: could not have
8
+ didn't: did not
9
+ doesn't: does not
10
+ don't: do not
11
+ gonna: going to
12
+ hadn't: had not
13
+ hadn't've: had not have
14
+ hasn't: has not
15
+ haven't: have not
16
+ he'd: he had
17
+ he'd've: he would have
18
+ he'll: he shall
19
+ he's: he has
20
+ he'sn't: he has not
21
+ how'd: how did
22
+ how'll: how will
23
+ how's: how has
24
+ i'd: i had
25
+ i'd've: i would have
26
+ i'll: i shall
27
+ i'm: i am
28
+ i've: i have
29
+ i'ven't: i have not
30
+ isn't: is not
31
+ it'd: it had
32
+ it'd've: it would have
33
+ it'll: it shall
34
+ it's: it has
35
+ it'sn't: it has not
36
+ let's: let us
37
+ ma'am: madam
38
+ mightn't: might not
39
+ mightn't've: might not have
40
+ might've: might have
41
+ mustn't: must not
42
+ must've: must have
43
+ needn't: need not
44
+ not've: not have
45
+ o'clock: of the clock
46
+ ol': old
47
+ oughtn't: ought not
48
+ shan't: shall not
49
+ she'd: she had
50
+ she'd've: she would have
51
+ she'll: she shall
52
+ she's: she has
53
+ she'sn't: she has not
54
+ should've: should have
55
+ shouldn't: should not
56
+ shouldn't've: should not have
57
+ somebody'd: somebody had
58
+ somebody'd've: somebody would have
59
+ somebody'dn't've: somebody would not have
60
+ somebody'll: somebody shall
61
+ somebody's: somebody has
62
+ someone'd: someone had
63
+ someone'd've: someone would have
64
+ someone'll: someone shall
65
+ someone's: someone has
66
+ something'd: something had
67
+ something'd've: something would have
68
+ something'll: something shall
69
+ something's: something has
70
+ "'sup": "what's up"
71
+ that'll: that will
72
+ that's: that has
73
+ there'd: there had
74
+ there'd've: there would have
75
+ there're: there are
76
+ there's: there has
77
+ they'd: they had
78
+ they'dn't: they would not
79
+ they'dn't've: they would not have
80
+ they'd've: they would have
81
+ they'd'ven't: they would have not
82
+ they'll: they shall
83
+ they'lln't've: they will not have
84
+ they'll'ven't: they will have not
85
+ they're: they are
86
+ they've: they have
87
+ they'ven't: they have not
88
+ "'tis": it is
89
+ "'twas": it was
90
+ wanna: want to
91
+ wasn't: was not
92
+ we'd: we had
93
+ we'd've: we would have
94
+ we'dn't've: we would not have
95
+ we'll: we will
96
+ we'lln't've: we will not have
97
+ we're: we are
98
+ we've: we have
99
+ weren't: were not
100
+ what'll: what shall
101
+ what're: what are
102
+ what's: what has
103
+ what've: what have
104
+ when's: when has
105
+ where'd: where did
106
+ where's: where has
107
+ where've: where have
108
+ who'd: who would
109
+ who'd've: who would have
110
+ who'll: who shall
111
+ who're: who are
112
+ who's: who has
113
+ who've: who have
114
+ why'll: why will
115
+ why're: why are
116
+ why's: why has
117
+ won't: will not
118
+ won't've: will not have
119
+ would've: would have
120
+ wouldn't: would not
121
+ wouldn't've: would not have
122
+ y'all: you all
123
+ y'all'd've: you all would have
124
+ y'all'dn't've: you all would not have
125
+ y'all'll: you all will
126
+ y'all'lln't: you all will not
127
+ y'all'll've: you all will have
128
+ y'all'll'ven't: you all will have not
129
+ you'd: you had
130
+ you'd've: you would have
131
+ you'll: you shall
132
+ you're: you are
133
+ you'ren't: you are not
134
+ you've: you have
135
+ you'ven't: you have not
@@ -7,7 +7,7 @@ module TextRank
7
7
  # converting non-ascii characters to related ascii characters, forcing text to
8
8
  # lower case, stripping out HTML, converting English contractions (e.g. "won't")
9
9
  # to the non-contracted form ("will not"), and more.
10
- #
10
+ #
11
11
  # Character filters are applied as a chain, so care should be taken to use them
12
12
  # in the desired order.
13
13
  ##
@@ -1,5 +1,3 @@
1
- require 'set'
2
-
3
1
  module TextRank
4
2
  ##
5
3
  # Class used to compare documents according to TextRank. A "fingerprint"
@@ -61,28 +59,22 @@ module TextRank
61
59
  # Calculates the "similarity" between this fingerprint and another
62
60
  # @param other [Fingerprint] A second fingerprint to compare
63
61
  # @return [Number] A number between 0.0 (different) and 1.0 (same)
64
- def similarity(trf2)
65
- return 1.0 if values == trf2.values
66
-
67
- sim = 0
68
- s1 = Set.new
69
- s2 = Set.new
62
+ def similarity(other)
63
+ return 1.0 if values == other.values # Short-circuit for efficiency
70
64
 
71
- [size, trf2.size].max.times.reduce(0) do |sum, i|
72
- v1 = values[i]
73
- v2 = trf2.values[i]
74
- if v1 == v2
75
- sim += 1
76
- else
77
- s1.delete?(v2) ? (sim += 1) : (s2 << v2)
78
- s2.delete?(v1) ? (sim += 1) : (s1 << v1)
79
- end
80
- sum + sim * linear_transform[i]
65
+ sum = 0
66
+ overlap(other).each_with_index do |overlap_value, i|
67
+ sum += overlap_value * linear_transform[i]
81
68
  end
69
+ sum
82
70
  end
83
71
 
84
72
  private
85
73
 
74
+ def overlap(other)
75
+ FingerprintOverlap.new(values, other.values).overlap
76
+ end
77
+
86
78
  def linear_transform
87
79
  @linear_transform ||= size.times.map do |i|
88
80
  1.0 / Math.log(i + 2) / size.to_f / norm_factor
@@ -0,0 +1,55 @@
1
+ module TextRank
2
+ ##
3
+ # Determines the "overlap" between two fingerprints for each prefix of length N
4
+ #
5
+ # For example,
6
+ #
7
+ # FingerprintOverlap.new(
8
+ # %w[a b c d],
9
+ # %w[b e a c],
10
+ # ).overlap
11
+ #
12
+ # => [
13
+ # 0, # [a] & [b] have no overlap
14
+ # 1, # [a b] & [b e] have one overlap: b
15
+ # 2, # [a b c] & [b e a] have two overlaps: a & b
16
+ # 3, # [a b c d] & [b e a c] have three overlaps: a, b, & c
17
+ # ]
18
+ ##
19
+ class FingerprintOverlap
20
+
21
+ attr_reader :overlap
22
+
23
+ def initialize(values1, values2)
24
+ raise ArgumentError, 'Value size mismatch' if values1.size != values2.size
25
+
26
+ @encountered1 = Set.new
27
+ @encountered2 = Set.new
28
+ @overlap_count = 0
29
+
30
+ @overlap = determine_overlap(values1, values2)
31
+ end
32
+
33
+ private
34
+
35
+ def determine_overlap(values1, values2)
36
+ values1.zip(values2).map do |v1, v2|
37
+ encounter(v1, v2)
38
+ @overlap_count
39
+ end
40
+ end
41
+
42
+ # This algorithm is a little more complex than the simplest Ruby version would be,
43
+ # but we want to keep it as performant as possible.
44
+ def encounter(value1, value2)
45
+ if value1 == value2
46
+ @overlap_count += 1
47
+ else
48
+ # Delete from the set in case an element appears more than once
49
+ @encountered1.delete?(value2) ? (@overlap_count += 1) : (@encountered2 << value2)
50
+ @encountered2.delete?(value1) ? (@overlap_count += 1) : (@encountered1 << value1)
51
+ end
52
+ end
53
+
54
+ end
55
+ end
@@ -61,18 +61,27 @@ module TextRank
61
61
  # @return [nil]
62
62
  def build_graph(tokens, graph)
63
63
  ngram_window = @ngram_size * 2 + 1
64
- tokens.each_with_index do |token_i, i|
64
+ tokens.size.times do |i|
65
65
  ngram_window.times do |j|
66
- next if j == @ngram_size || i + j < @ngram_size
67
- token_j = tokens[i - @ngram_size + j]
68
- if token_j
69
- graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
70
- end
66
+ consider_ngram_window(tokens, graph, i, j)
71
67
  end
72
68
  end
73
69
  nil
74
70
  end
75
71
 
72
+ private
73
+
74
+ def consider_ngram_window(tokens, graph, i, j)
75
+ return if j == @ngram_size || i + j < @ngram_size
76
+
77
+ token_i = tokens[i]
78
+ token_j = tokens[i - @ngram_size + j]
79
+
80
+ if token_j
81
+ graph.add(token_i, token_j, weight: 1.0 / (j - @ngram_size).abs)
82
+ end
83
+ end
84
+
76
85
  end
77
86
  end
78
87
  end
@@ -13,9 +13,9 @@ module TextRank
13
13
  # @return [KeywordExtractor]
14
14
  def self.basic(**options)
15
15
  new(**{
16
- char_filters: [:AsciiFolding, :Lowercase],
17
- tokenizers: [:Word],
18
- token_filters: [:Stopwords, :MinLength],
16
+ char_filters: %i[AsciiFolding Lowercase],
17
+ tokenizers: %i[Word],
18
+ token_filters: %i[Stopwords MinLength],
19
19
  graph_strategy: :Coocurrence,
20
20
  }.merge(options))
21
21
  end
@@ -25,11 +25,11 @@ module TextRank
25
25
  # @return [KeywordExtractor]
26
26
  def self.advanced(**options)
27
27
  new(**{
28
- char_filters: [:AsciiFolding, :Lowercase, :StripHtml, :StripEmail, :UndoContractions, :StripPossessive],
29
- tokenizers: [:Url, :Money, :Number, :Word, :Punctuation],
30
- token_filters: [:PartOfSpeech, :Stopwords, :MinLength],
28
+ char_filters: %i[AsciiFolding Lowercase StripHtml StripEmail UndoContractions StripPossessive],
29
+ tokenizers: %i[Url Money Number Word Punctuation],
30
+ token_filters: %i[PartOfSpeech Stopwords MinLength],
31
31
  graph_strategy: :Coocurrence,
32
- rank_filters: [:CollapseAdjacent, :NormalizeUnitVector, :SortByValue],
32
+ rank_filters: %i[CollapseAdjacent NormalizeUnitVector SortByValue],
33
33
  }.merge(options))
34
34
  end
35
35
 
@@ -41,14 +41,14 @@ module TextRank
41
41
  # @option options [Array<Class, Symbol, #filter!>] :rank_filters A list of filters to be applied to the keyword ranks after keyword extraction
42
42
  def initialize(**options)
43
43
  @page_rank_options = {
44
- strategy: options[:strategy] || :sparse,
45
- damping: options[:damping],
44
+ strategy: options[:strategy] || :sparse,
45
+ damping: options[:damping],
46
46
  tolerance: options[:tolerance],
47
47
  }
48
- @char_filters = options[:char_filters] || []
49
- @tokenizers = options[:tokenizers] || [Tokenizer::Word]
50
- @token_filters = options[:token_filters] || []
51
- @rank_filters = options[:rank_filters] || []
48
+ @char_filters = options[:char_filters] || []
49
+ @tokenizers = options[:tokenizers] || [Tokenizer::Word]
50
+ @token_filters = options[:token_filters] || []
51
+ @rank_filters = options[:rank_filters] || []
52
52
  @graph_strategy = options[:graph_strategy] || GraphStrategy::Coocurrence
53
53
  end
54
54
 
@@ -73,9 +73,7 @@ module TextRank
73
73
  # Sets the graph strategy for producing a graph from tokens
74
74
  # @param strategy [Class, Symbol, #build_graph] Strategy for producing a graph from tokens
75
75
  # @return [Class, Symbol, #build_graph]
76
- def graph_strategy=(strategy)
77
- @graph_strategy = strategy
78
- end
76
+ attr_writer :graph_strategy
79
77
 
80
78
  # Add a new TokenFilter for processing tokens after tokenization
81
79
  # @param filter [Class, Symbol, #filter!] A filter to process tokens after tokenization
@@ -105,14 +103,23 @@ module TextRank
105
103
  end
106
104
 
107
105
  # Filter & tokenize text, and return PageRank
108
- # @param text [String] unfiltered text to be processed
106
+ # @param text [String,Array<String>] unfiltered text to be processed
109
107
  # @return [Hash<String, Float>] tokens and page ranks (in descending order)
110
108
  def extract(text, **options)
111
- tokens = tokenize(text)
109
+ text = Array(text)
110
+ tokens_per_text = text.map do |t|
111
+ tokenize(t)
112
+ end
112
113
  graph = PageRank.new(**@page_rank_options)
113
- classify(@graph_strategy, context: GraphStrategy).build_graph(tokens, graph)
114
+ strategy = classify(@graph_strategy, context: GraphStrategy)
115
+ tokens_per_text.each do |tokens|
116
+ strategy.build_graph(tokens, graph)
117
+ end
114
118
  ranks = graph.calculate(**options)
115
- apply_rank_filters(ranks, tokens: tokens, original_text: text)
119
+ tokens_per_text.each_with_index do |tokens, i|
120
+ ranks = apply_rank_filters(ranks, tokens: tokens, original_text: text[i])
121
+ end
122
+ ranks
116
123
  end
117
124
 
118
125
  private
@@ -153,14 +160,14 @@ module TextRank
153
160
  array.insert(idx, value)
154
161
  end
155
162
 
156
- def classify(c, context: self)
157
- case c
163
+ def classify(clazz, context: self)
164
+ case clazz
158
165
  when Class
159
- c.new
166
+ clazz.new
160
167
  when Symbol
161
- context.const_get(c).new
168
+ context.const_get(clazz).new
162
169
  else
163
- c
170
+ clazz
164
171
  end
165
172
  end
166
173