hacker-curse 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'hacker/curse/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "hacker-curse"
+   spec.version       = Hacker::Curse::VERSION
+   spec.authors       = ["kepler"]
+   spec.email         = ["githubkepler.50s@gishpuppy.com"]
+   spec.summary       = %q{View Hacker News and reddit articles on the terminal using ncurses}
+   spec.description   = %q{View Hacker News and reddit articles on the terminal using ncurses}
+   spec.homepage      = "https://github.com/mare-imbrium/hacker-curse"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rake", ">= 0.9.6"
+   spec.add_runtime_dependency "canis", ">= 0.0.3"
+ end
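
A quick sanity check of the spec above from irb (a sketch; the filename "hacker-curse.gemspec" is an assumption, since the diff does not name its files):

    spec = Gem::Specification.load("hacker-curse.gemspec")  # assumed filename
    spec.name                              # => "hacker-curse"
    spec.runtime_dependencies.map(&:to_s)  # => ["canis (>= 0.0.3)"]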
@@ -0,0 +1,7 @@
+ require "hacker/curse/version"
+
+ module Hacker
+   module Curse
+     # Your code goes here...
+   end
+ end
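
The hacker/curse/version.rb file required above is not shown in this diff. Assuming the standard Bundler gem layout, a minimal sketch of it would be:

    module Hacker
      module Curse
        VERSION = "0.0.2"  # assumed to match the released 0.0.2 package above
      end
    end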
@@ -0,0 +1,353 @@
+ #!/usr/bin/env ruby -w
+ #
+ # Fetch hacker news front page entries into a hash.
+ # TODO : get next page. Next is /news2, but after that it changes.
+ # TODO : 2014-07-27 - 12:42 put items into the hash in a fixed order, so printers can use the first 4 cols for a long listing:
+ #        title, age_text, comment_count, points, article_url, comments_url, age, submitter, submitter_url
+ #
+ require 'open-uri'
+ require 'nokogiri'
+
+ # this is from hacker news itself
+ #file = "news.html"
+
+ module HackerCurse
+   class ForumPage
+     include Enumerable
+     # new, newest, hot, rising etc.
+     attr_accessor :url
+     attr_accessor :next_url
+     attr_accessor :create_date
+     attr_accessor :subforum
+     # array of article objects
+     attr_accessor :articles
+     def each
+       @articles.each do |e| yield(e) ; end
+     end
+     alias :each_article :each
+     def merge_page page
+       self.next_url = page.next_url
+       self.articles.push(*page.articles)
+       self
+     end
+   end
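+
+   # Illustrative ForumPage usage (a sketch; assumes pages produced by a concrete
+   # parser such as HackerNewsParser):
+   #   page1 = hn.get_first_page
+   #   page2 = hn.get_next_page :page => page1
+   #   page1.merge_page(page2)                  # page1 now holds both article sets
+   #   page1.each_article { |a| puts a.title }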
+   class ForumArticle
+     attr_accessor :title
+     attr_accessor :article_url
+     attr_accessor :points
+     attr_accessor :comment_count
+     attr_accessor :comments_url
+     attr_accessor :age_text
+     attr_accessor :age
+     attr_accessor :submitter
+     attr_accessor :submitter_url
+     attr_accessor :domain
+     attr_accessor :domain_url
+     # byline is a dump of the text on top containing all the info on points, number of comments, "nn hours ago"
+     attr_accessor :byline
+     attr_accessor :parent
+     attr_writer :comments
+     attr_reader :hash
+     def initialize h
+       @comments = nil
+       @hash = h
+       [:title, :article_url, :points, :comment_count, :comments_url, :age_text, :age,
+        :submitter, :submitter_url, :domain, :domain_url, :byline].each do |sym|
+         instance_variable_set("@#{sym.to_s}", h[sym]) if h.key? sym
+       end
+       if h.key? :comments
+         c = h[:comments]
+         @comments = Array.new
+         c.each do |ch|
+           fc = ForumComment.new ch
+           @comments << fc
+         end
+       end
+     end
+
+     def comments
+       @comments || retrieve_comments(@comments_url)
+     end
+     def each
+       comments.each do |e| yield(e) ; end
+     end
+     def retrieve_comments url
+       raise "Parent must be set in order to retrieve comments" unless @parent
+       @parent._retrieve_comments url
+     end
+     alias :each_comment :each
+     def [](sym)
+       @hash[sym]
+     end
+     def keys
+       @hash.keys
+     end
+     def values
+       @hash.values
+     end
+   end
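+
+   # ForumArticle offers both accessor and hash-style access (a sketch):
+   #   art.title         # via attr_accessor
+   #   art[:title]       # same value via the captured hash
+   #   art.keys          # every key the parser filled in
+   #   art.comments      # lazy: fetched through @parent on first access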
+   class ForumComment
+     attr_accessor :submitter, :submitter_url
+     attr_accessor :age, :age_text, :points, :head
+     attr_accessor :comment_text
+     attr_accessor :comment_url
+     attr_reader :hash
+     def initialize h
+
+       @hash = h
+       [:points, :comment_url, :age_text, :age,
+        :submitter, :submitter_url, :comment_text, :head].each do |sym|
+         instance_variable_set("@#{sym.to_s}", h[sym])
+       end
+     end
+     def [](sym)
+       @hash[sym]
+     end
+     def keys
+       @hash.keys
+     end
+     def values
+       @hash.values
+     end
+   end
+
+   #
+   # Intended usage (pseudo-code):
+   #   rn = RNParser.new [url]
+   #   rn.subreddit = "ruby"
+   #   resultset = rn.get_next_page :page => prevresultset, :number => 5
+   #   resultset.each do |art|
+   #     art.title, art.points
+   #     art.comments
+   #   end
+   #
+   #   hn = HNewsParser.new @options
+   #   hn.subxxx = "news" / "newest"
+   #
+   #   redditnews.rb -s ruby --pages 2
+   #   hackernews.rb -s newest --pages 2 -d '|'
+   #
+
+   class AbstractSiteParser
+     attr_reader :more_url
+     attr_accessor :host
+     attr_accessor :num_pages
+     attr_accessor :subforum
+     # should the html be saved
+     attr_accessor :save_html
+     attr_accessor :htmloutfile
+     #HOST = "https://news.ycombinator.com"
+     def initialize options={}
+       @options = options
+       @url = @options[:url]
+       @save_html = @options[:save_html]
+       @htmloutfile = @options[:htmloutfile]
+       @num_pages = @options[:num_pages] || 1
+       @more_url = nil
+       #puts "initialize: url is #{@url} "
+     end
+     def get_first_page
+       #@arr = to_hash @url
+       _retrieve_page @url
+     end
+     def get_next_page opts={}
+       page = opts[:page]
+       num_pages = opts[:num_pages] || @num_pages
+       num_pages ||= 1
+       u = @more_url || @url
+       if page
+         u = page.next_url
+       end
+       pages = nil
+       num_pages.times do |i|
+         page = _retrieve_page u
+         if pages.nil?
+           pages = page
+         else
+           pages.merge_page page
+         end
+         u = page.next_url
+         break unless u # sometimes there is no next
+         @more_url = u
+       end
+       return pages
+     end
+     alias :get_next :get_next_page
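+     # Pagination sketch for the methods above: fetch three pages merged into one
+     # ForumPage, then continue from where the previous call stopped:
+     #   page = parser.get_next_page :num_pages => 3
+     #   more = parser.get_next_page :page => page  # resumes from page.next_url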
+     def _retrieve_page url
+       raise "must be implemented by concrete class"
+     end
+     # Write as yml. This doesn't work well with multiple pages, since we call it
+     # once per page, so the previous file gets overwritten.
+     # This should be called on the final class.
+     def to_yml outfile, arr = @arr
+       require 'yaml'
+       # cannot just convert / to __ in filename since path gets converted too
+       #if outfile.index("/")
+       #outfile = outfile.gsub("/","__")
+       #end
+       File.open(outfile, 'w' ) do |f|
+         f << YAML::dump(arr)
+       end
+     end
+     # After calling get_next_page, one may pass its return value to this method
+     # to convert it into a hash and store it as a yml file.
+     # It's a bit silly: first we break the page down into this hash structure,
+     # and then dump the whole thing.
+     def save_page_as_yml outputfile, page
+       h = {}
+       h[:url] = page.url
+       h[:next_url] = page.next_url
+       h[:subforum] = page.subforum
+       h[:create_date] = page.create_date
+       articles = []
+       page.each do |a| articles << a.hash; end
+
+       h[:articles] = articles
+
+       to_yml outputfile, h
+     end
+     # retrieves the comments for a url and stores them in outputfile in YML format
+     def save_comments_as_yml outputfile, url
+       pages = _retrieve_comments url
+       if pages
+         to_yml outputfile, pages.hash
+       end
+     end
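+     # Round-trip sketch (file name illustrative): the saved YML loads back into
+     # the same hash structure that save_page_as_yml built:
+     #   parser.save_page_as_yml "news.yml", page
+     #   h = YAML.load(File.read("news.yml"))
+     #   h[:articles].first[:title]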
+     # Returns a Nokogiri HTML doc, and writes out the HTML if so required.
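+     # Note: open(url) below relies on open-uri patching Kernel#open; that behaviour
+     # was deprecated in Ruby 2.7 and removed in 3.0, where URI.open is needed instead.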
+     def get_doc_for_url url
+       #puts "get_doc #{url} "
+       out = open(url)
+       doc = Nokogiri::HTML(out)
+       if @save_html
+         subforum = @subforum || "unknown"
+         outfile = @htmloutfile || "#{subforum}.html"
+         #if !File.exists? url
+         out.rewind
+         File.open(outfile, 'w') {|f| f.write(out.read) }
+         #end
+       end
+       return doc
+     end
+     # this is a test method, so we don't keep hitting HN while testing and getting IP blocked.
+     def load_from_yml filename="hn.yml"
+       @arr = YAML::load( File.open( filename ) )
+       next_url = @arr.last[:article_url]
+       unless next_url.index("http")
+         next_url = @host + "/" + next_url
+       end
+       @more_url = next_url
+     end
+     def _retrieve_comments url
+       raise "Must be implemented by concrete class"
+     end
+     public
+     def get_comments_url index
+       arr = @arr
+       entry = arr[index]
+       if entry
+         if entry.key? :comments_url
+           return entry[:comments_url]
+         end
+       end
+       return nil
+     end
+     public
+     def get_comments index
+       url = get_comments_url index
+       if url
+         #puts url
+         comments = convert_comment_url url
+         return comments
+       #else
+         #puts "Sorry no url for #{index} "
+       end
+       return []
+     end
+     alias :get_comments_for_link :get_comments
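+     # Converts a relative age such as "3 hours ago" to approximate epoch seconds,
+     # e.g. Time.now.to_i - 3*60*60; months are approximated as 30 days.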
+     def human_age_to_unix age_text
+       i = age_text.to_i
+       ff = 1
+       if age_text.index("hour")
+         i *= ff*60*60
+       elsif age_text.index("second")
+         i *= ff
+       elsif age_text.index("minute")
+         i *= ff*60
+       elsif age_text.index("day")
+         i *= ff*60*60*24
+       elsif age_text.index("month")
+         i *= ff*60*60*24*30
+       elsif age_text.index("week")
+         i *= ff*60*60*24*7
+       elsif age_text.index("year")
+         i *= ff*60*60*24*365
+       else
+         #raise "don't know how to convert #{age_text} "
+         return 0
+       end
+       return (Time.now.to_i - i)
+     end
+   end
+ end
+ include HackerCurse
+
+
+ if __FILE__ == $0
+   #rn = HackerNewsParser.new :url => "hackernews.html"
+   rn = RedditNewsParser.new :url => "reddit-prog.html" # NOTE: RedditNewsParser is defined elsewhere, not in this file
+
+   page = rn.get_next_page # [page if supplied, take page.next_url, otherwise store??]
+   puts "For each article :::"
+   page.each do |art|
+     puts art.title, art.points, art.age_text, art.age, Time.at(art.age)
+   end # each_article
+   art = page.articles.first
+   puts "PRINTING comments "
+   art.each_comment do |c|
+     puts
+     puts " ======"
+     puts c.head
+     s = nil
+     if c.age
+       s = Time.at(c.age)
+     end
+     puts " #{c.age_text} | #{c.submitter} | #{c.age} . #{s} "
+     puts c.comment_text
+   end
+
+   exit
+   articles = page.articles
+   co = articles.first.comments
+   puts "PRINTING comments "
+   puts co[:title], co[:subtext]
+   comments = co[:comments]
+   comments.each_with_index do |c,i|
+     puts "======= #{c[:head]} : "
+     puts " - #{c[:head]} : "
+     puts " #{c[:comment]} "
+     puts " "
+   end
+
+   #comments.each_with_index do |c,i|
+   #puts " #{i}: #{c} "
+   #end
+   exit
+   art.each_comment do |cc|
+   end
+   #rn.next_url = page.next_url
+   rn.set_next_url(page)
+   #arr = rn.convert_comment_url "hn_comments.html"
+   #rn.to_yml "hn_comments.yml", arr
+
+
+   arr = rn.get_next_page
+   rn.to_yml "hn.yml"
+   puts "getting comments for link 1"
+   comments = rn.get_comments_for_link 1
+   if comments.empty?
+     comments = rn.get_comments_for_link 9
+   end
+   rn.to_yml "hn-comments.yml", comments
+   puts "getting next page"
+   arr1 = rn.get_next_page
+   rn.to_yml "hn-1.yml", arr1
+ end
@@ -0,0 +1,226 @@
+ require 'hacker/curse/abstractsiteparser'
+
+ module HackerCurse
+
+   class HackerNewsParser < AbstractSiteParser
+     def initialize config={}
+       @host = config[:host] || "https://news.ycombinator.com"
+       subforum = config[:subforum] || "news"
+       _url = "#{@host}/#{subforum}"
+       @subforum = subforum
+       config[:url] ||= _url
+       super config
+     end
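+     # Construction sketch: the subforum maps straight onto the HN path, so
+     #   hn = HackerNewsParser.new :subforum => "newest", :num_pages => 2
+     # fetches from https://news.ycombinator.com/newest.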
+     def _retrieve_page url
+       #puts "got url #{url} "
+       raise "url should be string" unless url.is_a? String
+       arr = to_hash url
+       page = hash_to_class arr
+       #to_yml "#{@subforum}.yml", arr
+       return page
+     end
+     # First builds a Hash containing various entries relating to the main article,
+     # which can be accessed directly.
+     # It contains an array :comments of hashes, where :head holds the text of the head,
+     # :comment holds the text of the comment, and there are further entries for the submitter:
+     #   hash[:comments].each do |e| e[:comment] ; end
+     # @return a ForumArticle; iterating it yields ForumComment objects:
+     #   pages.each do |co| puts co.comment_text, co.head; end
+     def _retrieve_comments url
+       arr = to_hash_comment url
+       # TODO break head into points, age etc.
+       pages = hash_to_comment_class arr
+       return pages
+     end
+     def hash_to_comment_class arr
+       page = ForumArticle.new arr
+       return page
+     end
+     def oldhash_to_comment_class arr
+       co = arr[:comments]
+       pages = Array.new
+       co.each do |h|
+         page = ForumComment.new h
+         pages << page
+       end
+       return pages
+     end
+     def to_hash_comment url
+       # for testing i may send in a saved file, so i don't keep hitting HN
+       if !File.exist? url
+         unless url.index("http")
+           url = @host + "/" + url
+         end
+       end
+       page = Nokogiri::HTML(open(url))
+       h = {}
+       title = page.css("td.title")
+       article_url = title.css("a").first["href"]
+       h[:title] = title.text
+       h[:article_url] = article_url
+
+       subtext = page.css("td.subtext")
+       h[:byline] = subtext.text
+       # TODO extract age_text
+       h[:age_text] = subtext.text.scan(/\d+ \w+ ago/).first
+       score = subtext.css("span").text
+       h[:points] = score
+       subtext.css("a").each_with_index do |e, i|
+         link = e["href"]
+         text = e.text
+         if link.index("user") == 0
+           h[:submitter] = text
+           h[:submitter_url] = link
+         elsif link.index("item") == 0
+           h[:comment_count] = text
+           h[:comments_url] = link
+         end
+       end
+
+       # need to get points
+       comheads = page.css("span.comhead") # .collect do |e| e.text ; end
+       comments = page.css("span.comment").collect do |e| e.text ; end
+       comheads.delete(comheads.first)
+       # array of comments
+       carr = Array.new
+       comheads.zip(comments) do |head,c|
+         hh = {}; hh[:head] = head.text;
+         #$stderr.puts "head:: #{head.text}"
+         m = head.text.scan(/\d+ \w+ ago/)
+         if !m.empty?
+           hh[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
+           hh[:age] = human_age_to_unix(m.first)
+           head.css("a").each_with_index do |e, i|
+             link = e["href"]
+             text = e.text
+             if link.index("user") == 0
+               hh[:submitter] = text
+               hh[:submitter_url] = link
+             elsif link.index("item") == 0
+               hh[:text] = text
+               hh[:comment_url] = link
+             end
+           end
+         end
+         hh[:comment_text] = c;
+         carr << hh
+       end
+
+       h[:comments] = carr
+       return h
+     end
+     def hash_to_class h
+       p = ForumPage.new
+       p.url = h[:url]
+       p.next_url = h[:next_url]
+       p.create_date = h[:create_date]
+       p.subforum = h[:subforum]
+       art = h[:articles]
+       arts = []
+       art.each do |a|
+         fa = ForumArticle.new a
+         fa.parent = self
+         arts << fa
+       end
+       p.articles = arts
+       return p
+     end
+     # convert the front page to a hash
+     def to_hash url
+       doc = get_doc_for_url url
+       count = 0
+       page = {}
+       page[:url] = url
+       now = Time.now
+       page[:create_date_seconds] = now.to_i
+       page[:create_date] = now
+       page[:subforum] = @subforum
+
+       arr = Array.new
+       h = {}
+       links = doc.xpath("//table/tr/td/table/tr")
+       links.each_with_index do |li, i|
+         x = li.css("td.title")
+         if !x.empty?
+           #puts " ---- title ----- #{x.count} "
+           count = x[0].text
+           #puts count
+           if x.count < 2
+             # this block is for the next_url ("More" link)
+             article_url = x[0].css("a")[0]["href"] # link url
+             #puts article_url
+             h = {}
+             h[:title] = count
+             h[:article_url] = article_url
+             more = count
+             more_url = "#{@host}/#{article_url}"
+             #arr << h
+             page[:next_url] = more_url
+             #puts li
+           end
+           break if x.count < 2
+
+           # actual article url
+           title = x[1].css("a")[0].text # title
+           article_url = x[1].css("a")[0]["href"] # link url
+           #puts article_url
+           #puts title
+           h = {}
+           #h[:number] = count
+           h[:title] = title
+           # "Ask HN" entries have no hostname, since the link is relative to HN
+           if article_url.index("http") != 0
+             article_url = "#{@host}/#{article_url}"
+           end
+
+           h[:article_url] = article_url
+           arr << h
+         else
+           x = li.css("td.subtext")
+           if !x.empty?
+             fulltext = x.text
+             #puts " ---- subtext ----- (#{fulltext})"
+             submitter = nil
+             submitter_url = nil
+             comment = nil
+             comments_url = nil
+             t = x.css("a")
+             t.each_with_index do |tt, ii|
+               case ii
+               when 0
+                 submitter = tt.text
+                 submitter_url = tt["href"]
+               when 1
+                 comment = tt.text
+                 comments_url = tt["href"]
+                 comments_url = "#{@host}/#{comments_url}"
+               end
+             end
+             points = x.css("span").text rescue ""
+             #puts submitter
+             #puts submitter_url
+             #puts comment
+             #puts comments_url
+             #puts points
+             h[:submitter] = submitter
+             h[:submitter_url] = submitter_url
+             h[:comment_count] = comment.to_i.to_s.rjust(4)
+             h[:comments_url] = comments_url
+             h[:points] = points.to_i.to_s.rjust(4)
+             m = fulltext.scan(/\d+ \w+ ago/)
+             unless m.empty? # scan returns an array; an empty result must be skipped
+               #h[:age_text] = m.first
+               h[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
+               h[:age] = human_age_to_unix(m.first)
+             end
+             #puts "fulltext: #{fulltext} "
+             h[:byline] = fulltext
+           end
+         end
+       end
+       #return arr
+       page[:articles] = arr
+       return page
+     end
+   end # class
+ end # module
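
Taken together, a minimal end-to-end use of the two library files above might look like this (a sketch; assumes the gem's lib directory is on the load path and the network is reachable):

    require 'hacker/curse/hackernewsparser'
    include HackerCurse

    hn = HackerNewsParser.new :subforum => "news"
    page = hn.get_first_page
    page.each_article do |art|
      puts "#{art.points} | #{art.comment_count} | #{art.title}"
    end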