replacer_bot 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9cc0084fcde5c232d3ffe37093ef1c95e1556e5
4
- data.tar.gz: 0363fabbc0feb1e3a9b8a2fb08a06cf207120628
3
+ metadata.gz: b1e8d3be70c7dfae8451a8a7e6655bb74bb545a9
4
+ data.tar.gz: 062f05a23f866b7eb168f95160b7dc5d07a4632d
5
5
  SHA512:
6
- metadata.gz: 506d9982aa39ac4198556ed6b42bed26d6305f117a64bccb8df617b45eb74f311e3c259a46c237dc9bcbb5dd29ab98334fa5174d24b66c1e5d1545dc235e45a5
7
- data.tar.gz: 0e9bacffd8e373d1e05c7082d90fbbdb7f0c662e8b49eeef00ff4a96f33a88316955036c6e366aaf70ea4b1ac7af830d04f955133723ceefcd2794a64d9a7065
6
+ metadata.gz: 830a1b44dfc123d1ea276ef70deecd5f80276dd9900f43bc6d6c3d7024f87387b0b2acedd26197b028ba8d83509438bb354366bd555219fcf97aa5cf4725c04a
7
+ data.tar.gz: 1147ca7c0ec8a6dc588f68535d4fcd0d59f0d41d6844b87197a7e07a6df7a5a3b716a77be82506c4a3dd35b7b0c5e0443d9c4ac759f27d6644a0d16bbdc9c113
data/config/defaults.yml CHANGED
@@ -7,3 +7,4 @@ replacements:
7
7
  - open data: Taylor Swift
8
8
  - opendata: TaylorSwift
9
9
  interval: 60 #seconds
10
+ similarity_weighting: 4 # how many words a tweet must overlap already-seen tweets by for us to consider it 'similar'
@@ -3,13 +3,37 @@ module ReplacerBot
3
3
  def self.validate tweet
4
4
  archive = retrieve
5
5
  t = sanitise tweet
6
- valid = not(archive.include? t)
6
+ valid = not(archive.include? t) && not(similar_to_archive tweet, archive)
7
7
  archive.add t
8
8
  save archive
9
9
 
10
10
  valid
11
11
  end
12
12
 
13
+ def self.similar_to_archive tweet, archive
14
+ match = false
15
+
16
+ archive.each do |archived_tweet|
17
+ match = true if similar(tweet, archived_tweet)
18
+ end
19
+
20
+ match
21
+ end
22
+
23
+ def self.similar tweet, other_tweet, weighting: Config.instance.config.similarity_weighting
24
+ tweet_words = tweet.split ' '
25
+ return false if tweet_words.count < weighting
26
+
27
+ match = false
28
+
29
+ (tweet_words.count - (weighting - 1)).times do |i|
30
+ sample = tweet_words[i, weighting].join(' ').downcase
31
+ match = true if sanitise(other_tweet.downcase).index sanitise(sample)
32
+ end
33
+
34
+ match
35
+ end
36
+
13
37
  def self.retrieve
14
38
  begin
15
39
  Marshal.load File.open Config.instance.config.seen_tweets
@@ -1,3 +1,3 @@
1
1
  module ReplacerBot
2
- VERSION = "0.0.7"
2
+ VERSION = "0.0.8"
3
3
  end
@@ -60,7 +60,7 @@ module ReplacerBot
60
60
 
61
61
  it 'filters similar tweets', :vcr do
62
62
  SeenTweets.validate 'How open data can help save lives http://t.co/90U7bVq5UF'
63
- expect(replacer.tweets.count).to eq 19
63
+ expect(replacer.tweets.count).to eq 16
64
64
  end
65
65
  end
66
66
 
@@ -75,7 +75,7 @@ module ReplacerBot
75
75
  end
76
76
 
77
77
  it 'actually sends tweets', :vcr do
78
- expect(replacer.client).to(receive(:update)).exactly(18).times
78
+ expect(replacer.client).to(receive(:update)).exactly(16).times
79
79
  interval = replacer.config.interval
80
80
  replacer.config.interval = 0
81
81
  replacer.tweet
@@ -84,6 +84,30 @@ module ReplacerBot
84
84
  ]
85
85
  end
86
86
 
87
+ context 'overlap of words' do
88
+ # n is set in the default config, a lower value makes the bot less noisy at the risk of false negatives
89
+ it 'does not match on tweets with fewer than n words' do
90
+ expect(described_class.similar 'appears to match', 'You would think this appears to match').to eq false
91
+ end
92
+
93
+ it 'sees tweets which overlap by at least n words as similar' do
94
+ expect(described_class.similar 'This is a string of words', 'Also this is a string of words innit').to eq true
95
+ expect(described_class.similar 'This is a string of words', 'Also this is a similar string similar words innit').to eq false
96
+ expect(described_class.similar 'This one will be a definite match ', 'So this one will be a definite match no doubt').to eq true
97
+ end
98
+
99
+ it 'deals sensibly with URLs and hashtags' do
100
+ expect(described_class.similar 'This one has a http://taylor.swift in it', 'So this one has a http://other.url/ in it here').to eq true
101
+ end
102
+
103
+ it 'works on real-world data' do
104
+ expect(described_class.
105
+ similar 'Netflix Releases Taylor Swift-Fetching Developer Preview: Netflix has released a developer preview of its in-house… bit.ly/1JfRdgA',
106
+ 'Netflix Releases Taylor Swift-Fetching Developer Preview - Netflix has released a developer preview of its in-house d...'
107
+ ).to eq true
108
+ end
109
+ end
110
+
87
111
  it 'saves a set' do
88
112
  set = Set.new [1, 2, 3]
89
113
  described_class.save set
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: replacer_bot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - pikesley