birdwatcher 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/lib/birdwatcher.rb +1 -0
- data/lib/birdwatcher/commands/resource.rb +1 -1
- data/lib/birdwatcher/commands/spool.rb +76 -0
- data/lib/birdwatcher/concerns/outputting.rb +1 -1
- data/lib/birdwatcher/concerns/presentation.rb +1 -2
- data/lib/birdwatcher/concerns/word_list.rb +21 -0
- data/lib/birdwatcher/console.rb +72 -5
- data/lib/birdwatcher/module.rb +1 -0
- data/lib/birdwatcher/modules/statuses/word_cloud.rb +19 -52
- data/lib/birdwatcher/modules/statuses/word_list.rb +145 -0
- data/lib/birdwatcher/version.rb +1 -1
- data/lib/birdwatcher/word_list.rb +66 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9fe692202c5f66cd4792434688658820d13f659f
|
4
|
+
data.tar.gz: afd0ce664e68cfe1228b46110ebedcfec139f260
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 25e84e1841967733afe7fd0a5967c2637da26bb0f9b540eeb0be1e219983fbb6e555216b4a111b0eaca7816023bc7bee675676772f5814c883e79c56e83521e0
|
7
|
+
data.tar.gz: 1edea6733ed598f18e649397e48b483dedff510ca686d8f94c26b26e6c23d0b7783fc532939072418608bec31778f868ffe0d124ed465bc17acf1e98355c8ac7
|
data/CHANGELOG.md
CHANGED
@@ -3,6 +3,18 @@ All notable changes to this project will be documented in this file.
|
|
3
3
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
5
|
## [Unreleased]
|
6
|
+
### Added
|
7
|
+
|
8
|
+
## [0.4.0]
|
9
|
+
### Added
|
10
|
+
- New `spool` command to spool console input and output to a file
|
11
|
+
- Write command history to `~/.birdwatcher_history` and load history on startup
|
12
|
+
for persistent command history
|
13
|
+
- New module `statuses/word_list` to generate a simple word list from statuses
|
14
|
+
- Introduced new `Birdwatcher::WordList` class to process text into a word list
|
15
|
+
|
16
|
+
### Changed
|
17
|
+
- Refactored `statuses/word_cloud` module to use new `Birdwatcher::WordList` class
|
6
18
|
|
7
19
|
## [0.3.1]
|
8
20
|
### Added
|
@@ -14,4 +26,4 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
14
26
|
- `posted_at` column added to `urls` for better and easier ordering
|
15
27
|
|
16
28
|
### Fixed
|
17
|
-
-
|
29
|
+
- Make `status search` command case insensitive
|
data/lib/birdwatcher.rb
CHANGED
@@ -19,6 +19,7 @@ require "birdwatcher/http_client"
|
|
19
19
|
require "birdwatcher/klout_client"
|
20
20
|
require "birdwatcher/punchcard"
|
21
21
|
require "birdwatcher/kml"
|
22
|
+
require "birdwatcher/word_list"
|
22
23
|
require "birdwatcher/console"
|
23
24
|
require "birdwatcher/configuration"
|
24
25
|
require "birdwatcher/configuration_wizard"
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Birdwatcher
  module Commands
    # Console command that mirrors console input and output into a file.
    #
    # Supported invocations:
    #   spool FILE        - start spooling to FILE (appending if it exists)
    #   spool start FILE  - same as above, with an explicit action keyword
    #   spool off|stop    - stop spooling and close the spool file
    #   spool status      - report whether spooling is active
    class Spool < Birdwatcher::Command
      self.meta = {
        :description => "Write console output into a file as well the screen",
        :names       => %w(spool),
        :usage       => "spool FILE|off"
      }

      # @return [String] long-form help text for the command
      def self.detailed_usage
<<-USAGE
The #{'spool'.bold} command can be used to write all console output into a file
as well the screen. The output will be appended to the file if it already exists.

#{'USAGE:'.bold}

#{'Spool output to a file:'.bold}
  spool FILE

#{'Turn off spooling:'.bold}
  spool off

#{'See status of spooling:'.bold}
  spool status
USAGE
      end

      # Dispatch on the first argument; anything that is not a known action
      # keyword is treated as the path of the file to spool to.
      #
      # @return [false] when no arguments (or no file path) were given
      def run
        unless arguments?
          error("You must provide a path to a file or an action")
          return false
        end
        case arguments.first.downcase
        when "off", "stop"
          stop_spooling
        when "status"
          status_spooling
        else
          # Covers both "spool start FILE" and the bare "spool FILE" form.
          start_spooling
        end
      end

      private

      # Open the requested file in append mode and install it as the console's
      # spool target.
      def start_spooling
        # For "spool start FILE" the action keyword is not part of the path.
        # (`arguments[1, -1]` would return nil here: a negative length is not
        # a range end, so drop the first element instead.)
        path_parts = arguments.first.downcase == "start" ? arguments.drop(1) : arguments
        file = path_parts.join(" ")
        if file.empty?
          error("You must provide a path to a file")
          return false
        end
        previous_spool = console.spool
        # Open the new file first so a failure leaves the current spool intact.
        console.spool = File.open(file, "a").tap { |f| f.sync = true }
        close_spool(previous_spool)
        info("Spooling output to #{file.bold}")
      end

      # Detach the spool from the console and release its file handle.
      def stop_spooling
        previous_spool = console.spool
        # Detach before closing so the console never holds a closed file.
        console.spool = nil
        close_spool(previous_spool)
        info("Output spooling stopped")
      end

      # Report whether output is currently being spooled, and where.
      def status_spooling
        if console.spool && console.spool.is_a?(File)
          info("Spooling output to #{console.spool.path.bold}")
        else
          info("Output spooling is stopped")
        end
      end

      # Close a previously installed spool file, if there is one still open.
      def close_spool(spool)
        spool.close if spool.is_a?(File) && !spool.closed?
      end
    end
  end
end
|
@@ -114,8 +114,7 @@ module Birdwatcher
|
|
114
114
|
# If the text is long, it will be automatically paged with the system's
|
115
115
|
# currently configured pager command (usually `less`).
|
116
116
|
def page_text(text)
|
117
|
-
::
|
118
|
-
rescue Errno::EPIPE
|
117
|
+
Birdwatcher::Console.instance.page_text(text)
|
119
118
|
end
|
120
119
|
end
|
121
120
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Birdwatcher
  module Concerns
    # Mixin that gives including classes a factory method for
    # Birdwatcher::WordList objects.
    module WordList
      # Class-level companion module; it is extended onto every class that
      # includes this concern. It currently defines no methods.
      module ClassMethods; end

      class << self
        # Inclusion hook: extends the including class with ClassMethods.
        #
        # @param base [Module] the class or module including this concern
        def included(base)
          base.extend(ClassMethods)
        end
      end

      # Build a fresh word list object.
      #
      # @param options [Hash] options handed straight to the word list
      #
      # @return [Birdwatcher::WordList]
      def make_word_list(options = {})
        ::Birdwatcher::WordList.new(options)
      end
    end
  end
end
|
data/lib/birdwatcher/console.rb
CHANGED
@@ -3,14 +3,17 @@ module Birdwatcher
|
|
3
3
|
include Singleton
|
4
4
|
|
5
5
|
DEFAULT_AUTO_COMPLETION_STRINGS = [].freeze
|
6
|
-
DB_MIGRATIONS_PATH
|
7
|
-
LINE_SEPARATOR
|
6
|
+
DB_MIGRATIONS_PATH = File.expand_path("../../../db/migrations", __FILE__).freeze
|
7
|
+
LINE_SEPARATOR = ("=" * 80).freeze
|
8
|
+
HISTORY_FILE_NAME = ".birdwatcher_history".freeze
|
9
|
+
HISTORY_FILE_LOCATION = File.join(Dir.home, HISTORY_FILE_NAME).freeze
|
8
10
|
|
9
|
-
attr_accessor :current_workspace, :current_module
|
11
|
+
attr_accessor :current_workspace, :current_module, :spool
|
10
12
|
attr_reader :database
|
11
13
|
|
12
14
|
def initialize
|
13
15
|
@output_mutex = Mutex.new
|
16
|
+
@spool_mutex = Mutex.new
|
14
17
|
end
|
15
18
|
|
16
19
|
def start!
|
@@ -21,7 +24,9 @@ module Birdwatcher
|
|
21
24
|
Birdwatcher::Console.instance.auto_completion_strings.grep(/\A#{Regexp.escape(s)}/) + Dir["#{expanded_s}*"].grep(/^#{Regexp.escape(expanded_s)}/)
|
22
25
|
end
|
23
26
|
Readline.completion_append_character = ""
|
27
|
+
load_command_history
|
24
28
|
while input = Readline.readline(prompt_line, true)
|
29
|
+
save_to_spool(prompt_line)
|
25
30
|
input = input.to_s.strip
|
26
31
|
handle_input(input) unless input.empty?
|
27
32
|
end
|
@@ -29,6 +34,8 @@ module Birdwatcher
|
|
29
34
|
|
30
35
|
def handle_input(input)
|
31
36
|
input.strip!
|
37
|
+
save_command_to_history(input)
|
38
|
+
save_to_spool("#{input}\n")
|
32
39
|
command_name, argument_line = input.split(" ", 2).map(&:strip)
|
33
40
|
command_name.downcase
|
34
41
|
commands.each do |command|
|
@@ -52,14 +59,15 @@ module Birdwatcher
|
|
52
59
|
def output(data, newline = true)
|
53
60
|
data = "#{data}\n" if newline
|
54
61
|
with_output_mutex { print data }
|
62
|
+
save_to_spool(data)
|
55
63
|
end
|
56
64
|
|
57
65
|
def output_formatted(*args)
|
58
|
-
|
66
|
+
output(sprintf(*args), false)
|
59
67
|
end
|
60
68
|
|
61
69
|
def newline
|
62
|
-
|
70
|
+
output ""
|
63
71
|
end
|
64
72
|
|
65
73
|
def line_separator
|
@@ -77,6 +85,7 @@ module Birdwatcher
|
|
77
85
|
rescue => e
|
78
86
|
output " failed".bold.light_red
|
79
87
|
error "#{e.class}: ".bold + e.message
|
88
|
+
e.backtrace.each { |l| error l } if debugging_enabled?
|
80
89
|
exit(1) if fatal
|
81
90
|
end
|
82
91
|
|
@@ -92,6 +101,24 @@ module Birdwatcher
|
|
92
101
|
output "[-]".white.bold.on_red + " #{message}"
|
93
102
|
end
|
94
103
|
|
104
|
+
# Ask the user a yes/no question and record both the prompt and the answer
# in the spool.
#
# @param question [String] the question to ask; " (y/n) " is appended
#
# @return [Boolean] true when the user agreed, false otherwise
def confirm(question)
  prompt = "#{question} (y/n) "
  save_to_spool(prompt)
  agreed = HighLine.agree(prompt) ? true : false
  save_to_spool(agreed ? "y\n" : "n\n")
  agreed
end
|
115
|
+
|
116
|
+
# Copy the text to the spool, then hand it to the system pager.
#
# Errno::EPIPE raised while spooling or paging is swallowed and the method
# returns nil in that case.
#
# @param text [String] the text to page
def page_text(text)
  begin
    save_to_spool(text)
    pager = ::TTY::Pager::SystemPager.new
    pager.page(text)
  rescue Errno::EPIPE
    nil
  end
end
|
121
|
+
|
95
122
|
def twitter_client
|
96
123
|
if !@twitter_clients
|
97
124
|
@twitter_clients = create_twitter_clients!
|
@@ -179,6 +206,10 @@ module Birdwatcher
|
|
179
206
|
@output_mutex.synchronize { yield }
|
180
207
|
end
|
181
208
|
|
209
|
+
# Run the given block while holding the spool mutex.
#
# @return [Object] whatever the block returns
def with_spool_mutex
  @spool_mutex.synchronize do
    yield
  end
end
|
212
|
+
|
182
213
|
def create_twitter_clients!
|
183
214
|
clients = []
|
184
215
|
configuration.get!(:twitter).each do |keypair|
|
@@ -197,5 +228,41 @@ module Birdwatcher
|
|
197
228
|
end
|
198
229
|
clients
|
199
230
|
end
|
231
|
+
|
232
|
+
# Load previously saved commands from HISTORY_FILE_LOCATION into
# Readline::HISTORY, one stripped line per entry.
#
# Does nothing when the history file does not exist; emits a warning and
# loads nothing when it exists but is not readable.
def load_command_history
  return unless File.exist?(HISTORY_FILE_LOCATION)
  unless File.readable?(HISTORY_FILE_LOCATION)
    warn("Cannot load command history: #{HISTORY_FILE_LOCATION} is not readable")
    return
  end
  # File.foreach closes the file once iteration finishes; a bare
  # File.open(...).each_line would leave the handle open.
  File.foreach(HISTORY_FILE_LOCATION) do |command|
    Readline::HISTORY << command.strip
  end
end
|
243
|
+
|
244
|
+
# Append a command to the history file at HISTORY_FILE_LOCATION.
#
# When the file already exists but is not writable, a warning is emitted
# and nothing is written.
#
# @param command [String] the command line to record
def save_command_to_history(command)
  history_path = HISTORY_FILE_LOCATION
  read_only = File.exist?(history_path) && !File.writable?(history_path)
  if read_only
    warn("Cannot save command to history: #{HISTORY_FILE_LOCATION} is not writable")
    return
  end
  File.open(history_path, "a") { |history| history.puts(command) }
end
|
253
|
+
|
254
|
+
# Write a string to the spool file with color codes removed.
#
# Does nothing (and returns nil) when spooling is not enabled. The write
# itself happens while holding the spool mutex.
#
# @param string [#to_s] the text to record
def save_to_spool(string)
  if spool_enabled?
    plain_text = string.to_s.uncolorize
    with_spool_mutex do
      spool.write(plain_text)
    end
  end
end
|
259
|
+
|
260
|
+
# Whether a spool target is set and is a File.
#
# @return [Boolean, nil] falsy when no spool is set or it is not a File
def spool_enabled?
  target = spool
  target && target.is_a?(File)
end
|
263
|
+
|
264
|
+
# Whether the BIRDWATCHER_DEBUG environment variable is set (to any value,
# including an empty string).
#
# @return [Boolean]
def debugging_enabled?
  ENV.include?("BIRDWATCHER_DEBUG")
end
|
200
267
|
end
|
201
268
|
end
|
data/lib/birdwatcher/module.rb
CHANGED
@@ -95,8 +95,6 @@ module Birdwatcher
|
|
95
95
|
}
|
96
96
|
}
|
97
97
|
|
98
|
-
DEFAULT_EXCLUDED_WORDS = %w(rt via oh)
|
99
|
-
|
100
98
|
def self.info
|
101
99
|
<<-INFO
|
102
100
|
The Word Cloud module can generate a classic weighted word cloud from words used
|
@@ -133,37 +131,34 @@ INFO
|
|
133
131
|
error("There are no statuses to process")
|
134
132
|
return false
|
135
133
|
end
|
136
|
-
|
137
|
-
|
138
|
-
|
134
|
+
word_list = make_word_list(
|
135
|
+
:min_word_count => option_setting("MIN_WORD_COUNT"),
|
136
|
+
:min_word_length => option_setting("MIN_WORD_LENGTH"),
|
137
|
+
:exclude_words => option_setting("EXCLUDE_WORDS").to_s.split(" ").map(&:strip),
|
138
|
+
:exclude_stopwords => option_setting("EXCLUDE_STOPWORDS"),
|
139
|
+
:exclude_common_words => option_setting("EXCLUDE_COMMON"),
|
140
|
+
:exclude_hashtags => option_setting("EXCLUDE_HASHTAGS"),
|
141
|
+
:exclude_mentions => option_setting("EXCLUDE_MENTIONS"),
|
142
|
+
:word_cap => option_setting("WORD_CAP"),
|
143
|
+
:stopwords_file => File.join(DATA_DIRECTORY, "english_stopwords.txt"),
|
144
|
+
:common_words_file => File.join(DATA_DIRECTORY, "top100Kenglishwords.txt")
|
145
|
+
)
|
139
146
|
task("Processing #{statuses.count.to_s.bold} statuses...") do
|
140
147
|
statuses.each do |status|
|
141
|
-
|
142
|
-
next if exclude_word?(word)
|
143
|
-
words.key?(word) ? words[word] += 1 : words[word] = 1
|
144
|
-
end
|
148
|
+
word_list.add_to_corpus(status.text)
|
145
149
|
if option_setting("INCLUDE_PAGE_TITLES")
|
146
150
|
status.urls_dataset
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
next if exclude_word?(word)
|
152
|
-
words.key?(word) ? words[word] += 1 : words[word] = 1
|
153
|
-
end
|
151
|
+
.where("title IS NOT NULL")
|
152
|
+
.where("final_url NOT LIKE 'https://twitter.com/%'")
|
153
|
+
.map(&:title).each do |page_title|
|
154
|
+
word_list.add_to_corpus(page_title)
|
154
155
|
end
|
155
156
|
end
|
156
157
|
end
|
157
|
-
|
158
|
-
words.delete_if { |word, count| count < option_setting("MIN_WORD_COUNT").to_i }
|
159
|
-
end
|
160
|
-
sorted_words = words.sort_by { |word, count| count}.reverse
|
161
|
-
if option_setting("WORD_CAP")
|
162
|
-
sorted_words = sorted_words.take(option_setting("WORD_CAP").to_i)
|
163
|
-
end
|
158
|
+
word_list.process
|
164
159
|
end
|
165
160
|
task("Generating word cloud, patience please...") do
|
166
|
-
cloud = MagicCloud::Cloud.new(
|
161
|
+
cloud = MagicCloud::Cloud.new(word_list.word_list,
|
167
162
|
:rotate => :none,
|
168
163
|
:palette => option_setting("PALETTE").split(" ").map(&:strip)
|
169
164
|
).draw(option_setting("IMAGE_WIDTH").to_i, option_setting("IMAGE_HEIGHT").to_i).to_blob { self.format = "png" }
|
@@ -171,34 +166,6 @@ INFO
|
|
171
166
|
end
|
172
167
|
info("Word cloud written to #{option_setting('DEST').bold}")
|
173
168
|
end
|
174
|
-
|
175
|
-
private
|
176
|
-
|
177
|
-
def prepare_exclusion_list
|
178
|
-
@exclusion_list = DEFAULT_EXCLUDED_WORDS
|
179
|
-
if option_setting("EXCLUDE_WORDS")
|
180
|
-
@exclusion_list += option_setting("EXCLUDE_WORDS").split(" ").map { |w| w.strip.downcase }
|
181
|
-
end
|
182
|
-
if option_setting("EXCLUDE_STOPWORDS")
|
183
|
-
@exclusion_list += read_data_file("english_stopwords.txt").split("\n").map { |w| w.strip.downcase }
|
184
|
-
end
|
185
|
-
if option_setting("EXCLUDE_COMMON")
|
186
|
-
@exclusion_list += read_data_file("top100Kenglishwords.txt").split("\n").map(&:strip)
|
187
|
-
end
|
188
|
-
end
|
189
|
-
|
190
|
-
def exclude_word?(word)
|
191
|
-
return true if word.empty?
|
192
|
-
return true if option_setting("MIN_WORD_LENGTH") && word.length < option_setting("MIN_WORD_LENGTH").to_i
|
193
|
-
return true if option_setting("EXCLUDE_HASHTAGS") && word.start_with?("#")
|
194
|
-
return true if option_setting("EXCLUDE_MENTIONS") && word.start_with?("@")
|
195
|
-
return true if @exclusion_list.include?(word)
|
196
|
-
end
|
197
|
-
|
198
|
-
def split_into_words(text)
|
199
|
-
text = text.downcase.strip.gsub(/https?:\/\/[\S]+/, "").gsub(/[^0-9a-z@#_ ]/i, " ")
|
200
|
-
text.split(" ").map(&:strip)
|
201
|
-
end
|
202
169
|
end
|
203
170
|
end
|
204
171
|
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
module Birdwatcher
  module Modules
    module Statuses
      # Module that builds a word list from the text of statuses (and,
      # optionally, the titles of pages they link to) and writes it to a file,
      # one word per line.
      class Wordlist < Birdwatcher::Module
        self.meta = {
          :name        => "Word List",
          :description => "Generates a word list from statuses",
          :author      => "Michael Henriksen <michenriksen@neomailbox.ch>",
          :options     => {
            "DEST" => {
              :value       => nil,
              :description => "Destination file",
              :required    => true
            },
            "USERS" => {
              :value       => nil,
              :description => "Space-separated list of screen names (all users if empty)",
              :required    => false
            },
            "MIN_WORD_COUNT" => {
              :value       => 3,
              :description => "Exclude words mentioned fewer times than specified",
              :required    => false
            },
            "MIN_WORD_LENGTH" => {
              :value       => 6,
              :description => "Exclude words smaller than specified",
              :required    => false
            },
            "EXCLUDE_STOPWORDS" => {
              :value       => true,
              :description => "Exclude english stopwords",
              :required    => false,
              :boolean     => true
            },
            "EXCLUDE_COMMON" => {
              :value       => true,
              :description => "Exclude common english words",
              :required    => false,
              :boolean     => true
            },
            "EXCLUDE_WORDS" => {
              :value       => nil,
              :description => "Space-separated list of words to exclude",
              :required    => false
            },
            "EXCLUDE_HASHTAGS" => {
              :value       => true,
              :description => "Exclude Hashtags",
              :required    => false,
              :boolean     => true
            },
            "EXCLUDE_MENTIONS" => {
              :value       => true,
              :description => "Exclude @username mentions",
              :required    => false,
              :boolean     => true
            },
            "INCLUDE_PAGE_TITLES" => {
              :value       => false,
              :description => "Include web page titles from shared URLs (requires crawling with urls/crawl)",
              :required    => false,
              :boolean     => true
            },
            "WORD_CAP" => {
              :value       => nil,
              :description => "Cap list of words to specified amount",
              :required    => false
            },
            "INCLUDE_COUNT" => {
              :value       => false,
              :description => "Include the count with the words",
              :required    => false,
              :boolean     => true
            }
          }
        }

        # @return [String] descriptive text about the module
        def self.info
<<-INFO
The Word List module can generate a simple word list or dictionary from words
used in statuses across all or specific users.

Since users Tweet about their hobbies, interests, work, etc. generating a word
list from statuses can be very effective for password cracking.
INFO
        end

        # Collect the statuses, build the word list and write it to DEST.
        #
        # @return [false] when there are no statuses to process
        def run
          statuses = statuses_to_process
          if statuses.count.zero?
            error("There are no statuses to process")
            return false
          end
          word_list = make_word_list(word_list_settings)
          task("Processing #{statuses.count.to_s.bold} statuses...") do
            statuses.each { |status| feed_status_to_word_list(status, word_list) }
            word_list.process
          end
          task("Writing #{pluralize(word_list.word_list.length, 'word', 'words')} to file...") do
            write_word_list_file(word_list)
          end
          file_size = number_to_human_size(File.size(option_setting("DEST")))
          info("Wrote #{file_size.bold} to #{option_setting('DEST').bold}")
        end

        private

        # Statuses of the users named in USERS, or every status in the
        # current workspace when USERS is not set.
        def statuses_to_process
          return current_workspace.statuses_dataset unless option_setting("USERS")

          screen_names = option_setting("USERS").split(" ").map(&:strip)
          user_ids = current_workspace.users_dataset.where("screen_name IN ?", screen_names).map(&:id)
          current_workspace.statuses_dataset.where("user_id IN ?", user_ids)
        end

        # Translate the module's option settings into word list options.
        def word_list_settings
          {
            :min_word_count       => option_setting("MIN_WORD_COUNT"),
            :min_word_length      => option_setting("MIN_WORD_LENGTH"),
            :exclude_words        => option_setting("EXCLUDE_WORDS").to_s.split(" ").map(&:strip),
            :exclude_stopwords    => option_setting("EXCLUDE_STOPWORDS"),
            :exclude_common_words => option_setting("EXCLUDE_COMMON"),
            :exclude_hashtags     => option_setting("EXCLUDE_HASHTAGS"),
            :exclude_mentions     => option_setting("EXCLUDE_MENTIONS"),
            :word_cap             => option_setting("WORD_CAP"),
            :stopwords_file       => File.join(DATA_DIRECTORY, "english_stopwords.txt"),
            :common_words_file    => File.join(DATA_DIRECTORY, "top100Kenglishwords.txt")
          }
        end

        # Add the status text and, when INCLUDE_PAGE_TITLES is set, the titles
        # of its non-Twitter URLs to the word list's corpus.
        def feed_status_to_word_list(status, word_list)
          word_list.add_to_corpus(status.text)
          return unless option_setting("INCLUDE_PAGE_TITLES")

          page_titles = status.urls_dataset
            .where("title IS NOT NULL")
            .where("final_url NOT LIKE 'https://twitter.com/%'")
            .map(&:title)
          page_titles.each { |page_title| word_list.add_to_corpus(page_title) }
        end

        # Write one line per word to DEST, with ", COUNT" appended when
        # INCLUDE_COUNT is set.
        def write_word_list_file(word_list)
          File.open(option_setting("DEST"), "w") do |file|
            word_list.word_list.each do |(word, count)|
              line = option_setting("INCLUDE_COUNT") ? "#{word}, #{count}" : word
              file.puts(line)
            end
          end
        end
      end
    end
  end
end
|
data/lib/birdwatcher/version.rb
CHANGED
@@ -0,0 +1,66 @@
|
|
1
|
+
require "set"

module Birdwatcher
  # Turns a corpus of free text into a list of words ranked by how often
  # they occur.
  #
  # Recognised options (all optional):
  #   :min_word_count       - drop words occurring fewer times than this
  #   :min_word_length      - drop words shorter than this
  #   :exclude_words        - Array of extra words to drop
  #   :exclude_stopwords    - drop words listed in :stopwords_file
  #   :stopwords_file       - path to a newline-separated stopword file
  #   :exclude_common_words - drop words listed in :common_words_file
  #   :common_words_file    - path to a newline-separated common word file
  #   :exclude_hashtags     - drop words starting with "#"
  #   :exclude_mentions     - drop words starting with "@"
  #   :word_cap             - keep at most this many words
  #
  # Numeric options may be Integers or numeric Strings; they are coerced
  # with #to_i.
  class WordList
    attr_reader :options, :corpus, :word_list

    # @param options [Hash] word list options (see class documentation)
    def initialize(options)
      @options   = options
      @corpus    = []
      # Empty until #process runs; afterwards an Array of [word, count] pairs.
      @word_list = {}
    end

    # Add a piece of text to the corpus.
    #
    # @param text [#to_s] text to include when processing
    def add_to_corpus(text)
      @corpus << text.to_s
    end

    # Count the words of the corpus, apply the configured filters and store
    # the result in #word_list.
    #
    # @return [Array<Array(String, Integer)>] [word, count] pairs, most
    #   frequent first
    def process
      counts = Hash.new(0)
      corpus.each do |text|
        normalize_and_split(text).each do |word|
          counts[word] += 1 unless exclude_word?(word)
        end
      end
      if options[:min_word_count]
        minimum_count = options[:min_word_count].to_i
        counts.delete_if { |_word, count| count < minimum_count }
      end
      ranked = counts.sort_by { |_word, count| count }.reverse
      ranked = ranked.take(options[:word_cap].to_i) if options[:word_cap]
      @word_list = ranked
    end

    private

    # Lazily built set of lowercase words to exclude. A Set keeps the
    # per-word membership test cheap even with a large common-words file.
    def exclusion_list
      @exclusion_list ||= begin
        # Corpus words are downcased, so the exclusions must be as well.
        words = Array(options[:exclude_words]).map { |word| word.to_s.strip.downcase }
        if options[:stopwords_file] && options[:exclude_stopwords]
          words.concat(read_word_file(options[:stopwords_file]))
        end
        if options[:common_words_file] && options[:exclude_common_words]
          words.concat(read_word_file(options[:common_words_file]))
        end
        Set.new(words)
      end
    end

    # Read a newline-separated word file into stripped, lowercase words.
    def read_word_file(path)
      File.readlines(path).map { |line| line.strip.downcase }
    end

    # Lowercase the text, remove URLs, replace every character other than
    # letters, digits, "@", "#", "_" and space with a space, then split on
    # whitespace.
    def normalize_and_split(text)
      cleaned = text.downcase.strip.gsub(/https?:\/\/[\S]+/, "").gsub(/[^0-9a-z@#_ ]/i, " ")
      cleaned.split(" ")
    end

    # Whether a word should be left out of the word list.
    def exclude_word?(word)
      return true if word.empty?
      # Option values may arrive as Strings, so coerce before comparing.
      return true if options[:min_word_length] && word.length < options[:min_word_length].to_i
      return true if options[:exclude_hashtags] && word.start_with?("#")
      return true if options[:exclude_mentions] && word.start_with?("@")
      exclusion_list.include?(word)
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: birdwatcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Henrikesn
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: sequel
|
@@ -337,6 +337,7 @@ files:
|
|
337
337
|
- lib/birdwatcher/commands/set.rb
|
338
338
|
- lib/birdwatcher/commands/shell.rb
|
339
339
|
- lib/birdwatcher/commands/show.rb
|
340
|
+
- lib/birdwatcher/commands/spool.rb
|
340
341
|
- lib/birdwatcher/commands/status.rb
|
341
342
|
- lib/birdwatcher/commands/unset.rb
|
342
343
|
- lib/birdwatcher/commands/use.rb
|
@@ -348,6 +349,7 @@ files:
|
|
348
349
|
- lib/birdwatcher/concerns/persistence.rb
|
349
350
|
- lib/birdwatcher/concerns/presentation.rb
|
350
351
|
- lib/birdwatcher/concerns/util.rb
|
352
|
+
- lib/birdwatcher/concerns/word_list.rb
|
351
353
|
- lib/birdwatcher/configuration.rb
|
352
354
|
- lib/birdwatcher/configuration_wizard.rb
|
353
355
|
- lib/birdwatcher/console.rb
|
@@ -360,6 +362,7 @@ files:
|
|
360
362
|
- lib/birdwatcher/modules/statuses/kml.rb
|
361
363
|
- lib/birdwatcher/modules/statuses/sentiment.rb
|
362
364
|
- lib/birdwatcher/modules/statuses/word_cloud.rb
|
365
|
+
- lib/birdwatcher/modules/statuses/word_list.rb
|
363
366
|
- lib/birdwatcher/modules/urls/crawl.rb
|
364
367
|
- lib/birdwatcher/modules/urls/most_shared.rb
|
365
368
|
- lib/birdwatcher/modules/users/activity_plot.rb
|
@@ -373,6 +376,7 @@ files:
|
|
373
376
|
- lib/birdwatcher/punchcard.rb
|
374
377
|
- lib/birdwatcher/util.rb
|
375
378
|
- lib/birdwatcher/version.rb
|
379
|
+
- lib/birdwatcher/word_list.rb
|
376
380
|
- models/hashtag.rb
|
377
381
|
- models/influencee.rb
|
378
382
|
- models/influencer.rb
|