birdwatcher 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/lib/birdwatcher.rb +1 -0
- data/lib/birdwatcher/commands/resource.rb +1 -1
- data/lib/birdwatcher/commands/spool.rb +76 -0
- data/lib/birdwatcher/concerns/outputting.rb +1 -1
- data/lib/birdwatcher/concerns/presentation.rb +1 -2
- data/lib/birdwatcher/concerns/word_list.rb +21 -0
- data/lib/birdwatcher/console.rb +72 -5
- data/lib/birdwatcher/module.rb +1 -0
- data/lib/birdwatcher/modules/statuses/word_cloud.rb +19 -52
- data/lib/birdwatcher/modules/statuses/word_list.rb +145 -0
- data/lib/birdwatcher/version.rb +1 -1
- data/lib/birdwatcher/word_list.rb +66 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9fe692202c5f66cd4792434688658820d13f659f
|
4
|
+
data.tar.gz: afd0ce664e68cfe1228b46110ebedcfec139f260
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 25e84e1841967733afe7fd0a5967c2637da26bb0f9b540eeb0be1e219983fbb6e555216b4a111b0eaca7816023bc7bee675676772f5814c883e79c56e83521e0
|
7
|
+
data.tar.gz: 1edea6733ed598f18e649397e48b483dedff510ca686d8f94c26b26e6c23d0b7783fc532939072418608bec31778f868ffe0d124ed465bc17acf1e98355c8ac7
|
data/CHANGELOG.md
CHANGED
@@ -3,6 +3,18 @@ All notable changes to this project will be documented in this file.
|
|
3
3
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
5
|
## [Unreleased]
|
6
|
+
### Added
|
7
|
+
|
8
|
+
## [0.4.0]
|
9
|
+
### Added
|
10
|
+
- New `spool` command to spool console input and output to a file
|
11
|
+
- Write command history to `~/.birdwatcher_history` and load history on startup
|
12
|
+
for persistent command history
|
13
|
+
- New module `statuses/word_list` to generate a simple word list from statuses
|
14
|
+
- Introduced new `Birdwatcher::WordList` class to process text into a word list
|
15
|
+
|
16
|
+
### Changed
|
17
|
+
- Refactored `statuses/word_cloud` module to use new `Birdwatcher::WordList` class
|
6
18
|
|
7
19
|
## [0.3.1]
|
8
20
|
### Added
|
@@ -14,4 +26,4 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
14
26
|
- `posted_at` column added to `urls` for better and easier ordering
|
15
27
|
|
16
28
|
### Fixed
|
17
|
-
-
|
29
|
+
- Make `status search` command case insensitive
|
data/lib/birdwatcher.rb
CHANGED
@@ -19,6 +19,7 @@ require "birdwatcher/http_client"
|
|
19
19
|
require "birdwatcher/klout_client"
|
20
20
|
require "birdwatcher/punchcard"
|
21
21
|
require "birdwatcher/kml"
|
22
|
+
require "birdwatcher/word_list"
|
22
23
|
require "birdwatcher/console"
|
23
24
|
require "birdwatcher/configuration"
|
24
25
|
require "birdwatcher/configuration_wizard"
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Birdwatcher
  module Commands
    # Console command that mirrors console input and output into a file.
    #
    # Accepted invocations (dispatched in #run):
    #   spool FILE        start appending to FILE
    #   spool start FILE  same, with an explicit action word
    #   spool off|stop    stop spooling
    #   spool status      report whether spooling is active
    class Spool < Birdwatcher::Command
      self.meta = {
        :description => "Write console output into a file as well the screen",
        :names       => %w(spool),
        :usage       => "spool FILE|off"
      }

      # Long-form help text for the command.
      #
      # @return [String]
      def self.detailed_usage
        <<-USAGE
The #{'spool'.bold} command can be used to write all console output into a file
as well the screen. The output will be appended to the file if it already exists.

#{'USAGE:'.bold}

#{'Spool output to a file:'.bold}
spool FILE

#{'Turn off spooling:'.bold}
spool off

#{'See status of spooling:'.bold}
spool status
USAGE
      end

      # Dispatch on the first argument (case-insensitive). Anything that is
      # not a known action word is treated as the start of a file path.
      #
      # @return [false] when no arguments were given; otherwise the result of
      #   the selected action
      def run
        if !arguments?
          error("You must provide a path to a file or an action")
          return false
        end
        case arguments.first.downcase
        when "off", "stop"
          stop_spooling
        when "status"
          status_spooling
        else
          # Covers both "spool start FILE" and the bare "spool FILE" form.
          start_spooling
        end
      end

      private

      # Open the target file in append mode and install it as the console
      # spool. The path is every argument joined by spaces, minus a leading
      # "start" action word.
      #
      # @return [false] when no path was given
      def start_spooling
        # A range is required here: Array#[](start, length) returns nil for a
        # negative length, which made "spool start FILE" crash on nil.join.
        path_words = arguments.first.downcase == "start" ? arguments[1..-1] : arguments
        file = path_words.join(" ")
        if file.empty?
          error("You must provide a path to a file")
          return false
        end
        # Open the new file first so a failure leaves any current spool intact.
        new_spool = File.open(file, "a").tap { |f| f.sync = true }
        close_current_spool
        console.spool = new_spool
        info("Spooling output to #{file.bold}")
      end

      # Detach and close the current spool file, if any.
      def stop_spooling
        close_current_spool
        info("Output spooling stopped")
      end

      # Report whether output is currently being spooled, and where.
      def status_spooling
        if console.spool && console.spool.is_a?(File)
          info("Spooling output to #{console.spool.path.bold}")
        else
          info("Output spooling is stopped")
        end
      end

      # Clear the console's spool reference, then close the file it pointed
      # to so the descriptor is released right away instead of at GC time.
      def close_current_spool
        current = console.spool
        console.spool = nil
        current.close if current.is_a?(File) && !current.closed?
      end
    end
  end
end
|
@@ -114,8 +114,7 @@ module Birdwatcher
|
|
114
114
|
# If the text is long, it will be automatically paged with the system's
|
115
115
|
# currently configured pager command (usually `less`).
|
116
116
|
def page_text(text)
|
117
|
-
::
|
118
|
-
rescue Errno::EPIPE
|
117
|
+
Birdwatcher::Console.instance.page_text(text)
|
119
118
|
end
|
120
119
|
end
|
121
120
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Birdwatcher
  module Concerns
    # Mixin that gives the including class a factory method for
    # Birdwatcher::WordList objects.
    module WordList
      # Class-level extension point; it currently defines no methods.
      module ClassMethods
      end

      # Inclusion hook: extends the including class with ClassMethods.
      #
      # @param host [Module] the class or module including this concern
      def self.included(host)
        host.extend(ClassMethods)
      end

      # Build a fresh word list.
      #
      # @param options [Hash] options handed unchanged to the word list
      #
      # @return [Birdwatcher::WordList]
      def make_word_list(options = {})
        Birdwatcher::WordList.new(options)
      end
    end
  end
end
|
data/lib/birdwatcher/console.rb
CHANGED
@@ -3,14 +3,17 @@ module Birdwatcher
|
|
3
3
|
include Singleton
|
4
4
|
|
5
5
|
DEFAULT_AUTO_COMPLETION_STRINGS = [].freeze
|
6
|
-
DB_MIGRATIONS_PATH
|
7
|
-
LINE_SEPARATOR
|
6
|
+
DB_MIGRATIONS_PATH = File.expand_path("../../../db/migrations", __FILE__).freeze
|
7
|
+
LINE_SEPARATOR = ("=" * 80).freeze
|
8
|
+
HISTORY_FILE_NAME = ".birdwatcher_history".freeze
|
9
|
+
HISTORY_FILE_LOCATION = File.join(Dir.home, HISTORY_FILE_NAME).freeze
|
8
10
|
|
9
|
-
attr_accessor :current_workspace, :current_module
|
11
|
+
attr_accessor :current_workspace, :current_module, :spool
|
10
12
|
attr_reader :database
|
11
13
|
|
12
14
|
def initialize
|
13
15
|
@output_mutex = Mutex.new
|
16
|
+
@spool_mutex = Mutex.new
|
14
17
|
end
|
15
18
|
|
16
19
|
def start!
|
@@ -21,7 +24,9 @@ module Birdwatcher
|
|
21
24
|
Birdwatcher::Console.instance.auto_completion_strings.grep(/\A#{Regexp.escape(s)}/) + Dir["#{expanded_s}*"].grep(/^#{Regexp.escape(expanded_s)}/)
|
22
25
|
end
|
23
26
|
Readline.completion_append_character = ""
|
27
|
+
load_command_history
|
24
28
|
while input = Readline.readline(prompt_line, true)
|
29
|
+
save_to_spool(prompt_line)
|
25
30
|
input = input.to_s.strip
|
26
31
|
handle_input(input) unless input.empty?
|
27
32
|
end
|
@@ -29,6 +34,8 @@ module Birdwatcher
|
|
29
34
|
|
30
35
|
def handle_input(input)
|
31
36
|
input.strip!
|
37
|
+
save_command_to_history(input)
|
38
|
+
save_to_spool("#{input}\n")
|
32
39
|
command_name, argument_line = input.split(" ", 2).map(&:strip)
|
33
40
|
command_name.downcase
|
34
41
|
commands.each do |command|
|
@@ -52,14 +59,15 @@ module Birdwatcher
|
|
52
59
|
def output(data, newline = true)
|
53
60
|
data = "#{data}\n" if newline
|
54
61
|
with_output_mutex { print data }
|
62
|
+
save_to_spool(data)
|
55
63
|
end
|
56
64
|
|
57
65
|
def output_formatted(*args)
|
58
|
-
|
66
|
+
output(sprintf(*args), false)
|
59
67
|
end
|
60
68
|
|
61
69
|
def newline
|
62
|
-
|
70
|
+
output ""
|
63
71
|
end
|
64
72
|
|
65
73
|
def line_separator
|
@@ -77,6 +85,7 @@ module Birdwatcher
|
|
77
85
|
rescue => e
|
78
86
|
output " failed".bold.light_red
|
79
87
|
error "#{e.class}: ".bold + e.message
|
88
|
+
e.backtrace.each { |l| error l } if debugging_enabled?
|
80
89
|
exit(1) if fatal
|
81
90
|
end
|
82
91
|
|
@@ -92,6 +101,24 @@ module Birdwatcher
|
|
92
101
|
output "[-]".white.bold.on_red + " #{message}"
|
93
102
|
end
|
94
103
|
|
104
|
+
# Ask a yes/no question and record both the prompt and the answer in the
# spool.
#
# @param question [String] question text; " (y/n) " is appended
#
# @return [Boolean] true when HighLine.agree reports agreement
def confirm(question)
  prompt = "#{question} (y/n) "
  save_to_spool(prompt)
  agreed = HighLine.agree(prompt) ? true : false
  # The typed answer is not captured by the spool, so log its normalized form.
  save_to_spool(agreed ? "y\n" : "n\n")
  agreed
end
|
115
|
+
|
116
|
+
# Write the text to the spool, then show it through the system pager.
#
# @param text [String] text to page
def page_text(text)
  save_to_spool(text)
  pager = ::TTY::Pager::SystemPager.new
  pager.page(text)
rescue Errno::EPIPE
  # Broken pipe is deliberately ignored (presumably raised when the pager is
  # closed before all text was read — TODO confirm).
  nil
end
|
121
|
+
|
95
122
|
def twitter_client
|
96
123
|
if !@twitter_clients
|
97
124
|
@twitter_clients = create_twitter_clients!
|
@@ -179,6 +206,10 @@ module Birdwatcher
|
|
179
206
|
@output_mutex.synchronize { yield }
|
180
207
|
end
|
181
208
|
|
209
|
+
# Run the given block while holding the spool mutex.
#
# @return [Object] the block's return value
def with_spool_mutex
  @spool_mutex.synchronize do
    yield
  end
end
|
212
|
+
|
182
213
|
def create_twitter_clients!
|
183
214
|
clients = []
|
184
215
|
configuration.get!(:twitter).each do |keypair|
|
@@ -197,5 +228,41 @@ module Birdwatcher
|
|
197
228
|
end
|
198
229
|
clients
|
199
230
|
end
|
231
|
+
|
232
|
+
# Load previously saved commands from HISTORY_FILE_LOCATION into
# Readline::HISTORY, one stripped line per entry.
#
# Does nothing when the file does not exist, and warns without loading when
# it exists but is not readable.
def load_command_history
  return unless File.exist?(HISTORY_FILE_LOCATION)
  unless File.readable?(HISTORY_FILE_LOCATION)
    warn("Cannot load command history: #{HISTORY_FILE_LOCATION} is not readable")
    return
  end
  # File.foreach closes the file once iteration finishes; the previous
  # File.open(...).each_line form left the handle open.
  File.foreach(HISTORY_FILE_LOCATION) do |command|
    Readline::HISTORY << command.strip
  end
end
|
243
|
+
|
244
|
+
# Append one command to the history file at HISTORY_FILE_LOCATION.
#
# Warns and returns without writing when the file exists but is not writable.
#
# @param command [String] command line to record
def save_command_to_history(command)
  read_only = File.exist?(HISTORY_FILE_LOCATION) && !File.writable?(HISTORY_FILE_LOCATION)
  if read_only
    warn("Cannot save command to history: #{HISTORY_FILE_LOCATION} is not writable")
    return
  end
  File.open(HISTORY_FILE_LOCATION, "a") { |history| history.puts(command) }
end
|
253
|
+
|
254
|
+
# Write a string to the spool file with colors removed via String#uncolorize.
# Does nothing unless spooling is enabled.
#
# @param string [#to_s] text to record
def save_to_spool(string)
  if spool_enabled?
    plain_text = string.to_s.uncolorize
    with_spool_mutex do
      spool.write(plain_text)
    end
  end
end
|
259
|
+
|
260
|
+
# Whether a spool target is set and is a File.
#
# @return [Boolean, nil] the falsy spool value itself when none is set,
#   otherwise whether the spool is a File
def spool_enabled?
  target = spool
  target && target.is_a?(File)
end
|
263
|
+
|
264
|
+
# Whether the BIRDWATCHER_DEBUG environment variable is set, to any value.
#
# @return [Boolean]
def debugging_enabled?
  ENV.include?("BIRDWATCHER_DEBUG")
end
|
200
267
|
end
|
201
268
|
end
|
data/lib/birdwatcher/module.rb
CHANGED
@@ -95,8 +95,6 @@ module Birdwatcher
|
|
95
95
|
}
|
96
96
|
}
|
97
97
|
|
98
|
-
DEFAULT_EXCLUDED_WORDS = %w(rt via oh)
|
99
|
-
|
100
98
|
def self.info
|
101
99
|
<<-INFO
|
102
100
|
The Word Cloud module can generate a classic weighted word cloud from words used
|
@@ -133,37 +131,34 @@ INFO
|
|
133
131
|
error("There are no statuses to process")
|
134
132
|
return false
|
135
133
|
end
|
136
|
-
|
137
|
-
|
138
|
-
|
134
|
+
word_list = make_word_list(
|
135
|
+
:min_word_count => option_setting("MIN_WORD_COUNT"),
|
136
|
+
:min_word_length => option_setting("MIN_WORD_LENGTH"),
|
137
|
+
:exclude_words => option_setting("EXCLUDE_WORDS").to_s.split(" ").map(&:strip),
|
138
|
+
:exclude_stopwords => option_setting("EXCLUDE_STOPWORDS"),
|
139
|
+
:exclude_common_words => option_setting("EXCLUDE_COMMON"),
|
140
|
+
:exclude_hashtags => option_setting("EXCLUDE_HASHTAGS"),
|
141
|
+
:exclude_mentions => option_setting("EXCLUDE_MENTIONS"),
|
142
|
+
:word_cap => option_setting("WORD_CAP"),
|
143
|
+
:stopwords_file => File.join(DATA_DIRECTORY, "english_stopwords.txt"),
|
144
|
+
:common_words_file => File.join(DATA_DIRECTORY, "top100Kenglishwords.txt")
|
145
|
+
)
|
139
146
|
task("Processing #{statuses.count.to_s.bold} statuses...") do
|
140
147
|
statuses.each do |status|
|
141
|
-
|
142
|
-
next if exclude_word?(word)
|
143
|
-
words.key?(word) ? words[word] += 1 : words[word] = 1
|
144
|
-
end
|
148
|
+
word_list.add_to_corpus(status.text)
|
145
149
|
if option_setting("INCLUDE_PAGE_TITLES")
|
146
150
|
status.urls_dataset
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
next if exclude_word?(word)
|
152
|
-
words.key?(word) ? words[word] += 1 : words[word] = 1
|
153
|
-
end
|
151
|
+
.where("title IS NOT NULL")
|
152
|
+
.where("final_url NOT LIKE 'https://twitter.com/%'")
|
153
|
+
.map(&:title).each do |page_title|
|
154
|
+
word_list.add_to_corpus(page_title)
|
154
155
|
end
|
155
156
|
end
|
156
157
|
end
|
157
|
-
|
158
|
-
words.delete_if { |word, count| count < option_setting("MIN_WORD_COUNT").to_i }
|
159
|
-
end
|
160
|
-
sorted_words = words.sort_by { |word, count| count}.reverse
|
161
|
-
if option_setting("WORD_CAP")
|
162
|
-
sorted_words = sorted_words.take(option_setting("WORD_CAP").to_i)
|
163
|
-
end
|
158
|
+
word_list.process
|
164
159
|
end
|
165
160
|
task("Generating word cloud, patience please...") do
|
166
|
-
cloud = MagicCloud::Cloud.new(
|
161
|
+
cloud = MagicCloud::Cloud.new(word_list.word_list,
|
167
162
|
:rotate => :none,
|
168
163
|
:palette => option_setting("PALETTE").split(" ").map(&:strip)
|
169
164
|
).draw(option_setting("IMAGE_WIDTH").to_i, option_setting("IMAGE_HEIGHT").to_i).to_blob { self.format = "png" }
|
@@ -171,34 +166,6 @@ INFO
|
|
171
166
|
end
|
172
167
|
info("Word cloud written to #{option_setting('DEST').bold}")
|
173
168
|
end
|
174
|
-
|
175
|
-
private
|
176
|
-
|
177
|
-
def prepare_exclusion_list
|
178
|
-
@exclusion_list = DEFAULT_EXCLUDED_WORDS
|
179
|
-
if option_setting("EXCLUDE_WORDS")
|
180
|
-
@exclusion_list += option_setting("EXCLUDE_WORDS").split(" ").map { |w| w.strip.downcase }
|
181
|
-
end
|
182
|
-
if option_setting("EXCLUDE_STOPWORDS")
|
183
|
-
@exclusion_list += read_data_file("english_stopwords.txt").split("\n").map { |w| w.strip.downcase }
|
184
|
-
end
|
185
|
-
if option_setting("EXCLUDE_COMMON")
|
186
|
-
@exclusion_list += read_data_file("top100Kenglishwords.txt").split("\n").map(&:strip)
|
187
|
-
end
|
188
|
-
end
|
189
|
-
|
190
|
-
def exclude_word?(word)
|
191
|
-
return true if word.empty?
|
192
|
-
return true if option_setting("MIN_WORD_LENGTH") && word.length < option_setting("MIN_WORD_LENGTH").to_i
|
193
|
-
return true if option_setting("EXCLUDE_HASHTAGS") && word.start_with?("#")
|
194
|
-
return true if option_setting("EXCLUDE_MENTIONS") && word.start_with?("@")
|
195
|
-
return true if @exclusion_list.include?(word)
|
196
|
-
end
|
197
|
-
|
198
|
-
def split_into_words(text)
|
199
|
-
text = text.downcase.strip.gsub(/https?:\/\/[\S]+/, "").gsub(/[^0-9a-z@#_ ]/i, " ")
|
200
|
-
text.split(" ").map(&:strip)
|
201
|
-
end
|
202
169
|
end
|
203
170
|
end
|
204
171
|
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
module Birdwatcher
  module Modules
    module Statuses
      # Module that writes a word list, optionally with counts, built from
      # the text of statuses in the current workspace.
      class Wordlist < Birdwatcher::Module
        self.meta = {
          :name        => "Word List",
          :description => "Generates a word list from statuses",
          :author      => "Michael Henriksen <michenriksen@neomailbox.ch>",
          :options     => {
            "DEST" => {
              :value       => nil,
              :description => "Destination file",
              :required    => true
            },
            "USERS" => {
              :value       => nil,
              :description => "Space-separated list of screen names (all users if empty)",
              :required    => false
            },
            "MIN_WORD_COUNT" => {
              :value       => 3,
              :description => "Exclude words mentioned fewer times than specified",
              :required    => false
            },
            "MIN_WORD_LENGTH" => {
              :value       => 6,
              :description => "Exclude words smaller than specified",
              :required    => false
            },
            "EXCLUDE_STOPWORDS" => {
              :value       => true,
              :description => "Exclude english stopwords",
              :required    => false,
              :boolean     => true
            },
            "EXCLUDE_COMMON" => {
              :value       => true,
              :description => "Exclude common english words",
              :required    => false,
              :boolean     => true
            },
            "EXCLUDE_WORDS" => {
              :value       => nil,
              :description => "Space-separated list of words to exclude",
              :required    => false
            },
            "EXCLUDE_HASHTAGS" => {
              :value       => true,
              :description => "Exclude Hashtags",
              :required    => false,
              :boolean     => true
            },
            "EXCLUDE_MENTIONS" => {
              :value       => true,
              :description => "Exclude @username mentions",
              :required    => false,
              :boolean     => true
            },
            "INCLUDE_PAGE_TITLES" => {
              :value       => false,
              :description => "Include web page titles from shared URLs (requires crawling with urls/crawl)",
              :required    => false,
              :boolean     => true
            },
            "WORD_CAP" => {
              :value       => nil,
              :description => "Cap list of words to specified amount",
              :required    => false
            },
            "INCLUDE_COUNT" => {
              :value       => false,
              :description => "Include the count with the words",
              :required    => false,
              :boolean     => true
            }
          }
        }

        # Long-form description of the module.
        #
        # @return [String]
        def self.info
          <<-INFO
The Word List module can generate a simple word list or dictionary from words
used in statuses across all or specific users.

Since users Tweet about their hobbies, interests, work, etc. generating a word
list from statuses can be very effective for password cracking.
INFO
        end

        # Collect status text (and optionally page titles of shared URLs),
        # reduce it to a word list and write that list to the DEST file.
        #
        # @return [false] when there are no statuses to process
        def run
          if option_setting("USERS")
            screen_names = option_setting("USERS").split(" ").map(&:strip)
            user_ids = current_workspace.users_dataset.where("screen_name IN ?", screen_names).map(&:id)
            statuses = current_workspace.statuses_dataset.where("user_id IN ?", user_ids)
          else
            statuses = current_workspace.statuses_dataset
          end
          # Count once; the value is used for both the guard and the message.
          status_count = statuses.count
          if status_count.zero?
            error("There are no statuses to process")
            return false
          end
          word_list = make_word_list(
            :min_word_count       => option_setting("MIN_WORD_COUNT"),
            :min_word_length      => option_setting("MIN_WORD_LENGTH"),
            :exclude_words        => option_setting("EXCLUDE_WORDS").to_s.split(" ").map(&:strip),
            :exclude_stopwords    => option_setting("EXCLUDE_STOPWORDS"),
            :exclude_common_words => option_setting("EXCLUDE_COMMON"),
            :exclude_hashtags     => option_setting("EXCLUDE_HASHTAGS"),
            :exclude_mentions     => option_setting("EXCLUDE_MENTIONS"),
            :word_cap             => option_setting("WORD_CAP"),
            :stopwords_file       => File.join(DATA_DIRECTORY, "english_stopwords.txt"),
            :common_words_file    => File.join(DATA_DIRECTORY, "top100Kenglishwords.txt")
          )
          # Settings are read once up front instead of once per status/word.
          include_page_titles = option_setting("INCLUDE_PAGE_TITLES")
          task("Processing #{status_count.to_s.bold} statuses...") do
            statuses.each do |status|
              word_list.add_to_corpus(status.text)
              next unless include_page_titles
              status.urls_dataset
                .where("title IS NOT NULL")
                .where("final_url NOT LIKE 'https://twitter.com/%'")
                .map(&:title).each do |page_title|
                word_list.add_to_corpus(page_title)
              end
            end
            word_list.process
          end
          destination = option_setting("DEST")
          include_count = option_setting("INCLUDE_COUNT")
          task("Writing #{pluralize(word_list.word_list.length, 'word', 'words')} to file...") do
            File.open(destination, "w") do |f|
              # Each entry is a [word, count] pair.
              word_list.word_list.each do |word, count|
                f.puts(include_count ? "#{word}, #{count}" : word)
              end
            end
          end
          file_size = number_to_human_size(File.size(destination))
          info("Wrote #{file_size.bold} to #{destination.bold}")
        end
      end
    end
  end
end
|
data/lib/birdwatcher/version.rb
CHANGED
@@ -0,0 +1,66 @@
|
|
1
|
+
require "set"

module Birdwatcher
  # Turns a corpus of free text into a list of [word, count] pairs, filtered
  # by the options given at construction.
  #
  # Options read by this class: :min_word_count, :min_word_length,
  # :word_cap, :exclude_words, :exclude_stopwords, :stopwords_file,
  # :exclude_common_words, :common_words_file, :exclude_hashtags and
  # :exclude_mentions. Numeric options may be Integers or numeric Strings.
  class WordList
    # @return [Hash] options given to the constructor
    # @return [Array<String>] texts added so far
    # @return [Hash, Array<Array(String, Integer)>] an empty Hash until
    #   #process runs, then [word, count] pairs ordered by descending count
    attr_reader :options, :corpus, :word_list

    # @param options [Hash] word list options
    def initialize(options)
      @options   = options
      @corpus    = []
      @word_list = {}
    end

    # Add a piece of text to the corpus.
    #
    # @param text [#to_s] text to add
    def add_to_corpus(text)
      @corpus << text.to_s
    end

    # Count the words of the whole corpus, apply the exclusion rules, the
    # minimum count and the cap, and store the result in #word_list.
    #
    # @return [Array<Array(String, Integer)>] the resulting word list
    def process
      counts = Hash.new(0)
      corpus.each do |text|
        normalize_and_split(text).each do |word|
          counts[word] += 1 unless exclude_word?(word)
        end
      end
      if options[:min_word_count]
        minimum = options[:min_word_count].to_i
        counts.delete_if { |_word, count| count < minimum }
      end
      sorted_words = counts.sort_by { |_word, count| count }.reverse
      if options[:word_cap]
        sorted_words = sorted_words.take(options[:word_cap].to_i)
      end
      @word_list = sorted_words
    end

    private

    # Lazily built set of lowercase words to exclude. A Set keeps each
    # membership test constant-time however long the word files are.
    #
    # @return [Set<String>]
    def exclusion_list
      @exclusion_list ||= build_exclusion_list
    end

    # Combine the caller's own exclusions with the enabled word files.
    def build_exclusion_list
      # Corpus words are downcased, so the caller's words must be as well or
      # mixed-case exclusions would never match.
      list = Set.new(Array(options[:exclude_words]).map { |w| w.to_s.strip.downcase })
      if options[:stopwords_file] && options[:exclude_stopwords]
        list.merge(read_word_file(options[:stopwords_file]))
      end
      if options[:common_words_file] && options[:exclude_common_words]
        list.merge(read_word_file(options[:common_words_file]))
      end
      list
    end

    # Read a file with one word per line into stripped, lowercase words.
    def read_word_file(path)
      File.read(path).split("\n").map { |w| w.strip.downcase }
    end

    # Lowercase the text, drop URLs, replace every character outside
    # 0-9, a-z, @, #, _ and space with a space, then split on whitespace.
    #
    # @return [Array<String>]
    def normalize_and_split(text)
      text = text.downcase.strip.gsub(/https?:\/\/[\S]+/, "").gsub(/[^0-9a-z@#_ ]/i, " ")
      text.split(" ").map(&:strip)
    end

    # Whether a word is filtered out by length, prefix or the exclusion list.
    #
    # @return [Boolean]
    def exclude_word?(word)
      return true if word.empty?
      # to_i matches the other numeric options and accepts a String value.
      return true if options[:min_word_length] && word.length < options[:min_word_length].to_i
      return true if options[:exclude_hashtags] && word.start_with?("#")
      return true if options[:exclude_mentions] && word.start_with?("@")
      exclusion_list.include?(word)
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: birdwatcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Henrikesn
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: sequel
|
@@ -337,6 +337,7 @@ files:
|
|
337
337
|
- lib/birdwatcher/commands/set.rb
|
338
338
|
- lib/birdwatcher/commands/shell.rb
|
339
339
|
- lib/birdwatcher/commands/show.rb
|
340
|
+
- lib/birdwatcher/commands/spool.rb
|
340
341
|
- lib/birdwatcher/commands/status.rb
|
341
342
|
- lib/birdwatcher/commands/unset.rb
|
342
343
|
- lib/birdwatcher/commands/use.rb
|
@@ -348,6 +349,7 @@ files:
|
|
348
349
|
- lib/birdwatcher/concerns/persistence.rb
|
349
350
|
- lib/birdwatcher/concerns/presentation.rb
|
350
351
|
- lib/birdwatcher/concerns/util.rb
|
352
|
+
- lib/birdwatcher/concerns/word_list.rb
|
351
353
|
- lib/birdwatcher/configuration.rb
|
352
354
|
- lib/birdwatcher/configuration_wizard.rb
|
353
355
|
- lib/birdwatcher/console.rb
|
@@ -360,6 +362,7 @@ files:
|
|
360
362
|
- lib/birdwatcher/modules/statuses/kml.rb
|
361
363
|
- lib/birdwatcher/modules/statuses/sentiment.rb
|
362
364
|
- lib/birdwatcher/modules/statuses/word_cloud.rb
|
365
|
+
- lib/birdwatcher/modules/statuses/word_list.rb
|
363
366
|
- lib/birdwatcher/modules/urls/crawl.rb
|
364
367
|
- lib/birdwatcher/modules/urls/most_shared.rb
|
365
368
|
- lib/birdwatcher/modules/users/activity_plot.rb
|
@@ -373,6 +376,7 @@ files:
|
|
373
376
|
- lib/birdwatcher/punchcard.rb
|
374
377
|
- lib/birdwatcher/util.rb
|
375
378
|
- lib/birdwatcher/version.rb
|
379
|
+
- lib/birdwatcher/word_list.rb
|
376
380
|
- models/hashtag.rb
|
377
381
|
- models/influencee.rb
|
378
382
|
- models/influencer.rb
|