replacer_bot 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9cc0084fcde5c232d3ffe37093ef1c95e1556e5
4
- data.tar.gz: 0363fabbc0feb1e3a9b8a2fb08a06cf207120628
3
+ metadata.gz: b1e8d3be70c7dfae8451a8a7e6655bb74bb545a9
4
+ data.tar.gz: 062f05a23f866b7eb168f95160b7dc5d07a4632d
5
5
  SHA512:
6
- metadata.gz: 506d9982aa39ac4198556ed6b42bed26d6305f117a64bccb8df617b45eb74f311e3c259a46c237dc9bcbb5dd29ab98334fa5174d24b66c1e5d1545dc235e45a5
7
- data.tar.gz: 0e9bacffd8e373d1e05c7082d90fbbdb7f0c662e8b49eeef00ff4a96f33a88316955036c6e366aaf70ea4b1ac7af830d04f955133723ceefcd2794a64d9a7065
6
+ metadata.gz: 830a1b44dfc123d1ea276ef70deecd5f80276dd9900f43bc6d6c3d7024f87387b0b2acedd26197b028ba8d83509438bb354366bd555219fcf97aa5cf4725c04a
7
+ data.tar.gz: 1147ca7c0ec8a6dc588f68535d4fcd0d59f0d41d6844b87197a7e07a6df7a5a3b716a77be82506c4a3dd35b7b0c5e0443d9c4ac759f27d6644a0d16bbdc9c113
data/config/defaults.yml CHANGED
@@ -7,3 +7,4 @@ replacements:
7
7
  - open data: Taylor Swift
8
8
  - opendata: TaylorSwift
9
9
  interval: 60 #seconds
10
+ similarity_weighting: 4 # how many consecutive words a tweet must share with an already-seen tweet for us to consider it 'similar'
@@ -3,13 +3,37 @@ module ReplacerBot
3
3
  def self.validate tweet
4
4
  archive = retrieve
5
5
  t = sanitise tweet
6
- valid = not(archive.include? t)
6
+ valid = not(archive.include? t) && not(similar_to_archive tweet, archive)
7
7
  archive.add t
8
8
  save archive
9
9
 
10
10
  valid
11
11
  end
12
12
 
13
+ def self.similar_to_archive tweet, archive
14
+ match = false
15
+
16
+ archive.each do |archived_tweet|
17
+ match = true if similar(tweet, archived_tweet)
18
+ end
19
+
20
+ match
21
+ end
22
+
23
+ def self.similar tweet, other_tweet, weighting: Config.instance.config.similarity_weighting
24
+ tweet_words = tweet.split ' '
25
+ return false if tweet_words.count < weighting
26
+
27
+ match = false
28
+
29
+ (tweet_words.count - (weighting - 1)).times do |i|
30
+ sample = tweet_words[i, weighting].join(' ').downcase
31
+ match = true if sanitise(other_tweet.downcase).index sanitise(sample)
32
+ end
33
+
34
+ match
35
+ end
36
+
13
37
  def self.retrieve
14
38
  begin
15
39
  Marshal.load File.open Config.instance.config.seen_tweets
@@ -1,3 +1,3 @@
1
1
  module ReplacerBot
2
- VERSION = "0.0.7"
2
+ VERSION = "0.0.8"
3
3
  end
@@ -60,7 +60,7 @@ module ReplacerBot
60
60
 
61
61
  it 'filters similar tweets', :vcr do
62
62
  SeenTweets.validate 'How open data can help save lives http://t.co/90U7bVq5UF'
63
- expect(replacer.tweets.count).to eq 19
63
+ expect(replacer.tweets.count).to eq 16
64
64
  end
65
65
  end
66
66
 
@@ -75,7 +75,7 @@ module ReplacerBot
75
75
  end
76
76
 
77
77
  it 'actually sends tweets', :vcr do
78
- expect(replacer.client).to(receive(:update)).exactly(18).times
78
+ expect(replacer.client).to(receive(:update)).exactly(16).times
79
79
  interval = replacer.config.interval
80
80
  replacer.config.interval = 0
81
81
  replacer.tweet
@@ -84,6 +84,30 @@ module ReplacerBot
84
84
  ]
85
85
  end
86
86
 
87
+ context 'overlap of words' do
88
+ # n is the similarity_weighting set in the default config; a lower value makes the bot less noisy, at the risk of false negatives
89
+ it 'does not match on tweets with fewer than n words' do
90
+ expect(described_class.similar 'appears to match', 'You would think this appears to match').to eq false
91
+ end
92
+
93
+ it 'sees tweets which overlap by at least n words as similar' do
94
+ expect(described_class.similar 'This is a string of words', 'Also this is a string of words innit').to eq true
95
+ expect(described_class.similar 'This is a string of words', 'Also this is a similar string similar words innit').to eq false
96
+ expect(described_class.similar 'This one will be a definite match ', 'So this one will be a definite match no doubt').to eq true
97
+ end
98
+
99
+ it 'deals sensibly with URLs and hashtags' do
100
+ expect(described_class.similar 'This one has a http://taylor.swift in it', 'So this one has a http://other.url/ in it here').to eq true
101
+ end
102
+
103
+ it 'works on real-world data' do
104
+ expect(described_class.
105
+ similar 'Netflix Releases Taylor Swift-Fetching Developer Preview: Netflix has released a developer preview of its in-house… bit.ly/1JfRdgA',
106
+ 'Netflix Releases Taylor Swift-Fetching Developer Preview - Netflix has released a developer preview of its in-house d...'
107
+ ).to eq true
108
+ end
109
+ end
110
+
87
111
  it 'saves a set' do
88
112
  set = Set.new [1, 2, 3]
89
113
  described_class.save set
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: replacer_bot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - pikesley