hacker-curse 0.0.2

@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'hacker/curse/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "hacker-curse"
+   spec.version = Hacker::Curse::VERSION
+   spec.authors = ["kepler"]
+   spec.email = ["githubkepler.50s@gishpuppy.com"]
+   spec.summary = %q{View Hacker News and reddit articles on the terminal using ncurses}
+   spec.description = %q{View Hacker News and reddit articles on the terminal using ncurses}
+   spec.homepage = "https://github.com/mare-imbrium/hacker-curse"
+   spec.license = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0")
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rake", ">= 0.9.6"
+   spec.add_runtime_dependency "canis", ">= 0.0.3"
+ end
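
The hunk above is evidently the gemspec; the sole runtime dependency is the canis ncurses toolkit. Consuming the released gem from Bundler would look roughly like this; a minimal sketch, and the version pin is only an assumption based on the release number in the header:

    # Gemfile of a hypothetical consumer (not part of this diff)
    source "https://rubygems.org"
    gem "hacker-curse", "~> 0.0.2"
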
@@ -0,0 +1,7 @@
+ require "hacker/curse/version"
+
+ module Hacker
+   module Curse
+     # Your code goes here...
+   end
+ end
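
This hunk is the gem's top-level stub. The hacker/curse/version file that it and the gemspec require is not shown in this diff; judging by the release number, it presumably holds only the version constant:

    # lib/hacker/curse/version.rb -- assumed contents, not shown in this diff
    module Hacker
      module Curse
        VERSION = "0.0.2"
      end
    end
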
@@ -0,0 +1,353 @@
+ #!/usr/bin/env ruby -w
+ #
+ # Fetch Hacker News front page entries into a hash.
+ # TODO : get next page. Next is /news2, but after that it changes.
+ # TODO : 2014-07-27 - 12:42 put items in the hash in a fixed order, so printers can use the first 4 cols for a long listing:
+ #        title, age_text, comment_count, points, article_url, comments_url, age, submitter, submitter_url
+ #
+ require 'open-uri'
+ require 'nokogiri'
+
+ # this is from hacker news itself
+ #file = "news.html"
+
+ module HackerCurse
+   class ForumPage
+     include Enumerable
+     # new newest hot rising etc
+     attr_accessor :url
+     attr_accessor :next_url
+     attr_accessor :create_date
+     attr_accessor :subforum
+     # array of article objects
+     attr_accessor :articles
+     def each
+       @articles.each do |e| yield(e) ; end
+     end
+     alias :each_article :each
+     def merge_page page
+       self.next_url = page.next_url
+       self.articles.push(*page.articles)
+       self
+     end
+   end
+   class ForumArticle
+     attr_accessor :title
+     attr_accessor :article_url
+     attr_accessor :points
+     attr_accessor :comment_count
+     attr_accessor :comments_url
+     attr_accessor :age_text
+     attr_accessor :age
+     attr_accessor :submitter
+     attr_accessor :submitter_url
+     attr_accessor :domain
+     attr_accessor :domain_url
+     # byline is a dump of the text on top containing all the info on points, # of comments, "nn hours ago"
+     attr_accessor :byline
+     attr_accessor :parent
+     attr_writer :comments
+     attr_reader :hash
+     def initialize h
+       @comments = nil
+       @hash = h
+       [:title, :article_url, :points, :comment_count, :comments_url, :age_text, :age,
+        :submitter, :submitter_url, :domain, :domain_url, :byline].each do |sym|
+         instance_variable_set("@#{sym.to_s}", h[sym]) if h.key? sym
+       end
+       if h.key? :comments
+         c = h[:comments]
+         @comments = Array.new
+         c.each do |ch|          # renamed from |h| to avoid shadowing the outer hash
+           fc = ForumComment.new ch
+           @comments << fc
+         end
+       end
+     end
+
+     def comments
+       @comments || retrieve_comments(@comments_url)
+     end
+     def each
+       comments.each do |e| yield(e) ; end
+     end
+     def retrieve_comments url
+       raise "Parent must be set in order to retrieve comments" unless @parent
+       @parent._retrieve_comments url
+     end
+     alias :each_comment :each
+     def [](sym)
+       @hash[sym]
+     end
+     def keys
+       @hash.keys
+     end
+     def values
+       @hash.values
+     end
+   end
+   class ForumComment
+     attr_accessor :submitter, :submitter_url
+     attr_accessor :age, :age_text, :points, :head
+     attr_accessor :comment_text
+     attr_accessor :comment_url
+     attr_reader :hash
+     def initialize h
+       @hash = h
+       [:points, :comment_url, :age_text, :age,
+        :submitter, :submitter_url, :comment_text, :head].each do |sym|
+         instance_variable_set("@#{sym.to_s}", h[sym])
+       end
+     end
+     def [](sym)
+       @hash[sym]
+     end
+     def keys
+       @hash.keys
+     end
+     def values
+       @hash.values
+     end
+   end
+
+   #
+   # rn = RNParser.new [url]
+   # rn.subreddit = "ruby"
+   # resultset = rn.get_next_page :page => prevresultset, :number => 5
+   # resultset.each do |art|
+   #   art.title, art.points
+   #   art.comments
+   # end
+   #
+   # hn = HNewsParser @options
+   # hn.subxxx = "news" / "newest"
+   #
+   # redditnews.rb -s ruby --pages 2
+   # hackernews.rb -s newest --pages 2 -d '|'
+   #
+
+   class AbstractSiteParser
+     attr_reader :more_url
+     attr_accessor :host
+     attr_accessor :num_pages
+     attr_accessor :subforum
+     # should the html be saved
+     attr_accessor :save_html
+     attr_accessor :htmloutfile
+     #HOST = "https://news.ycombinator.com"
+     def initialize options={}
+       @options = options
+       @url = @options[:url]
+       @save_html = @options[:save_html]
+       @htmloutfile = @options[:htmloutfile]
+       @num_pages = @options[:num_pages] || 1
+       @more_url = nil
+       #puts "initialize: url is #{@url} "
+     end
+     def get_first_page
+       #@arr = to_hash @url
+       page = _retrieve_page @url
+     end
+     def get_next_page opts={}
+       page = opts[:page]
+       num_pages = opts[:num_pages] || @num_pages
+       num_pages ||= 1
+       u = @more_url || @url
+       if page
+         u = page.next_url
+       end
+       pages = nil
+       num_pages.times do |i|
+         page = _retrieve_page u
+         if pages.nil?
+           pages = page
+         else
+           pages.merge_page page
+         end
+         u = page.next_url
+         break unless u # sometimes there is no next
+         @more_url = u
+       end
+       return pages
+     end
+     alias :get_next :get_next_page
+     def _retrieve_page url
+       raise "must be implemented by concrete class"
+     end
+     # Write as yml. This doesn't work well with multiple pages, since we call it x times
+     # and the previous file gets overwritten.
+     # This should be called on the concrete class.
+     def to_yml outfile, arr = @arr
+       require 'yaml'
+       # cannot just convert / to __ in filename since path gets converted too
+       #if outfile.index("/")
+       #outfile = outfile.gsub("/","__")
+       #end
+       File.open(outfile, 'w') do |f|
+         f << YAML::dump(arr)
+       end
+     end
+     # After calling get_next_page, one may pass its return value
+     # to this method to convert it into an array of hashes and store it as a yml file.
+     # It's a bit silly: first we build the hash up into this structure,
+     # and then deconstruct the whole thing.
+     def save_page_as_yml outputfile, page
+       h = {}
+       h[:url] = page.url
+       h[:next_url] = page.next_url
+       h[:subforum] = page.subforum
+       h[:create_date] = page.create_date
+       articles = []
+       page.each do |a| articles << a.hash; end
+
+       h[:articles] = articles
+
+       to_yml outputfile, h
+     end
+     # retrieves the comments for a url and stores them in outputfile in YML format
+     def save_comments_as_yml outputfile, url
+       pages = _retrieve_comments url
+       if pages
+         to_yml outputfile, pages.hash
+       end
+     end
+     # returns a nokogiri html doc and writes the html if required.
+     def get_doc_for_url url
+       #puts "get_doc #{url} "
+       out = open(url)
+       doc = Nokogiri::HTML(out)
+       if @save_html
+         subforum = @subforum || "unknown"
+         outfile = @htmloutfile || "#{subforum}.html"
+         #if !File.exists? url
+         out.rewind
+         File.open(outfile, 'w') {|f| f.write(out.read) }
+         #end
+       end
+       return doc
+     end
+     # this is a test method so we don't keep hitting HN while testing and getting IP blocked.
+     def load_from_yml filename="hn.yml"
+       @arr = YAML::load( File.open( filename ) )
+       next_url = @arr.last[:article_url]
+       unless next_url.index("http")
+         next_url = @host + "/" + next_url
+       end
+       @more_url = next_url
+     end
+     def _retrieve_comments url
+       raise "Must be implemented by concrete class"
+     end
+     public
+     def get_comments_url index
+       arr = @arr
+       entry = arr[index]
+       if entry
+         if entry.key? :comments_url
+           return entry[:comments_url]
+         end
+       end
+       return nil
+     end
+     public
+     def get_comments index
+       url = get_comments_url index
+       if url
+         #puts url
+         comments = convert_comment_url url
+         return comments
+         #else
+         #puts "Sorry no url for #{index} "
+       end
+       return []
+     end
+     alias :get_comments_for_link :get_comments
+     # convert a human age string such as "3 hours ago" into a unix timestamp
+     def human_age_to_unix age_text
+       i = age_text.to_i
+       ff = 1
+       if age_text.index("hour")
+         i *= ff*60*60
+       elsif age_text.index("second")
+         i *= ff
+       elsif age_text.index("minute")
+         i *= ff*60
+       elsif age_text.index("day")
+         i *= ff*60*60*24
+       elsif age_text.index("month")
+         i *= ff*60*60*24*30
+       elsif age_text.index("week")
+         i *= ff*60*60*24*7
+       elsif age_text.index("year")
+         i *= ff*60*60*24*365
+       else
+         #raise "don't know how to convert #{age_text} "
+         return 0
+       end
+       return (Time.now.to_i - i)
+     end
+   end
+ end
+ include HackerCurse
+
+
+ if __FILE__ == $0
+   #rn = HackerNewsParser.new :url => "hackernews.html"
+   rn = RedditNewsParser.new :url => "reddit-prog.html"
+
+   page = rn.get_next_page # [page if supplied, take page.next_url, otherwise store??]
+   puts "For each article :::"
+   page.each do |art|
+     puts art.title, art.points, art.age_text, art.age, Time.at(art.age)
+   end # each_article
+   art = page.articles.first
+   puts "PRINTING comments "
+   art.each_comment do |c|
+     puts
+     puts " ======"
+     puts c.head
+     s = nil
+     if c.age
+       s = Time.at(c.age)
+     end
+     puts " #{c.age_text} | #{c.submitter} | #{c.age} . #{s} "
+     puts c.comment_text
+   end
+
+   exit
+   articles = page.articles
+   co = articles.first.comments
+   puts "PRINTING comments "
+   puts co[:title], co[:subtext]
+   comments = co[:comments]
+   comments.each_with_index do |c,i|
+     puts "======= #{c[:head]} : "
+     puts " - #{c[:head]} : "
+     puts " #{c[:comment]} "
+     puts " "
+   end
+
+   #comments.each_with_index do |c,i|
+   #puts " #{i}: #{c} "
+   #end
+   exit
+   art.each_comment do |cc|
+   end
+   #rn.next_url = page.next_url
+   rn.set_next_url(page)
+   #arr = rn.convert_comment_url "hn_comments.html"
+   #rn.to_yml "hn_comments.yml", arr
+
+
+   arr = rn.get_next_page
+   rn.to_yml "hn.yml"
+   puts "getting comments for link 1"
+   comments = rn.get_comments_for_link 1
+   if comments.empty?
+     comments = rn.get_comments_for_link 9
+   end
+   rn.to_yml "hn-comments.yml", comments
+   puts "getting next page"
+   arr1 = rn.get_next_page
+   rn.to_yml "hn-1.yml", arr1
+ end
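
The hunk above is the abstract parser (required by the next file as hacker/curse/abstractsiteparser). Driving it through the concrete Hacker News subclass defined in the next hunk would look roughly like this; a sketch, assuming the gem's lib directory is on the load path, using only option keys and accessors that appear in the code:

    require 'hacker/curse/hackernewsparser'

    hn = HackerCurse::HackerNewsParser.new :subforum => "news", :num_pages => 1
    page = hn.get_first_page            # a ForumPage
    page.each_article do |art|          # ForumArticle objects
      puts "#{art.points} | #{art.title} (#{art.age_text})"
    end
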
@@ -0,0 +1,226 @@
+ require 'hacker/curse/abstractsiteparser'
+
+ module HackerCurse
+
+   class HackerNewsParser < AbstractSiteParser
+     def initialize config={}
+       @host = config[:host] || "https://news.ycombinator.com"
+       subforum = config[:subforum] || "news"
+       _url = "#{@host}/#{subforum}"
+       @subforum = subforum
+       config[:url] ||= _url
+       super config
+     end
+     def _retrieve_page url
+       #puts "got url #{url} "
+       raise "url should be string" unless url.is_a? String
+       arr = to_hash url
+       page = hash_to_class arr
+       #to_yml "#{@subforum}.yml", arr
+       return page
+     end
+     # Currently returns a ForumArticle built from a Hash containing various entries relating
+     # to the main article. The hash contains an array :comments of hashes; :head contains the
+     # text of the head, :comment_text the text of the comment, and there are entries for the submitter.
+     #    hash[:comments].each do |e| e[:comment_text] ; end
+     # @return [ForumArticle] whose comments are ForumComment objects:
+     #    page.each_comment do |co| puts co.comment_text, co.head; end
+     def _retrieve_comments url
+       arr = to_hash_comment url
+       # TODO break head into points, age etc
+       pages = hash_to_comment_class arr
+       return pages
+     end
+     def hash_to_comment_class arr
+       page = ForumArticle.new arr
+       return page
+     end
+     def oldhash_to_comment_class arr
+       co = arr[:comments]
+       pages = Array.new
+       co.each do |h|
+         page = ForumComment.new h
+         pages << page
+       end
+       return pages
+     end
+     def to_hash_comment url
+       # for testing i may send in a saved file, so i don't keep hitting HN
+       if !File.exists? url
+         unless url.index("http")
+           url = @host + "/" + url
+         end
+       end
+       page = Nokogiri::HTML(open(url))
+       h = {}
+       title = page.css("td.title")
+       article_url = title.css("a").first["href"]
+       h[:title] = title.text
+       h[:article_url] = article_url
+
+       subtext = page.css("td.subtext")
+       h[:byline] = subtext.text
+       # TODO extract age_text
+       h[:age_text] = subtext.text.scan(/\d+ \w+ ago/).first
+       score = subtext.css("span").text
+       h[:points] = score
+       subtext.css("a").each_with_index do |e, i|
+         link = e["href"]
+         text = e.text
+         if link.index("user") == 0
+           h[:submitter] = text
+           h[:submitter_url] = link
+         elsif link.index("item") == 0
+           h[:comment_count] = text
+           h[:comments_url] = link
+         end
+       end
+
+       # need to get points
+       comheads = page.css("span.comhead") # .collect do |e| e.text ; end
+       comments = page.css("span.comment").collect do |e| e.text ; end
+       comheads.delete(comheads.first)
+       # array of comments
+       carr = Array.new
+       comheads.zip(comments) do |head, c|
+         hh = {}; hh[:head] = head.text;
+         #$stderr.puts "head:: #{head.text}"
+         m = head.text.scan(/\d+ \w+ ago/)
+         if !m.empty?
+           hh[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
+           hh[:age] = human_age_to_unix(m.first)
+           head.css("a").each_with_index do |e, i|
+             link = e["href"]
+             text = e.text
+             if link.index("user") == 0
+               hh[:submitter] = text
+               hh[:submitter_url] = link
+             elsif link.index("item") == 0
+               hh[:text] = text
+               hh[:comment_url] = link
+             end
+           end
+         end
+         hh[:comment_text] = c;
+         carr << hh
+       end
+
+       h[:comments] = carr
+       return h
+     end
+     def hash_to_class h
+       p = ForumPage.new
+       p.url = h[:url]
+       p.next_url = h[:next_url]
+       p.create_date = h[:create_date]
+       p.subforum = h[:subforum]
+       art = h[:articles]
+       arts = []
+       art.each do |a|
+         fa = ForumArticle.new a
+         fa.parent = self
+         arts << fa
+       end
+       p.articles = arts
+       return p
+     end
+     # convert the front page to a hash
+     def to_hash url
+       doc = get_doc_for_url url
+       count = 0
+       page = {}
+       page[:url] = url
+       now = Time.now
+       page[:create_date_seconds] = now.to_i
+       page[:create_date] = now
+       page[:subforum] = @subforum
+
+       arr = Array.new
+       h = {}
+       links = doc.xpath("//table/tr/td/table/tr")
+       links.each_with_index do |li, i|
+         x = li.css("td.title")
+         if !x.empty?
+           #puts " ---- title ----- #{x.count} "
+           count = x[0].text
+           #puts count
+           if x.count < 2
+             # this block is for the next_url
+             article_url = x[0].css("a")[0]["href"] # link url
+             #puts article_url
+             h = {}
+             h[:title] = count
+             h[:article_url] = article_url
+             more = count
+             more_url = "#{@host}/#{article_url}"
+             #arr << h
+             page[:next_url] = more_url
+             #puts li
+           end
+           break if x.count < 2
+
+           # actual article url
+           title = x[1].css("a")[0].text # title
+           article_url = x[1].css("a")[0]["href"] # link url
+           #puts article_url
+           #puts title
+           h = {}
+           #h[:number] = count
+           h[:title] = title
+           # ask option does not have hostname since it is relative to HN
+           if article_url.index("http") != 0
+             article_url = "#{@host}/#{article_url}"
+           end
+
+           h[:article_url] = article_url
+           arr << h
+         else
+           x = li.css("td.subtext")
+           if !x.empty?
+             fulltext = x.text
+             #puts " ---- subtext ----- (#{fulltext})"
+             submitter = nil
+             submitter_url = nil
+             comment = nil
+             comments_url = nil
+             t = x.css("a")
+             t.each_with_index do |tt, ii|
+               case ii
+               when 0
+                 submitter = tt.text
+                 submitter_url = tt["href"]
+               when 1
+                 comment = tt.text
+                 comments_url = tt["href"]
+                 comments_url = "#{@host}/#{comments_url}"
+               end
+             end
+             points = x.css("span").text rescue ""
+             #puts submitter
+             #puts submitter_url
+             #puts comment
+             #puts comments_url
+             #puts points
+             h[:submitter] = submitter
+             h[:submitter_url] = submitter_url
+             h[:comment_count] = comment.to_i.to_s.rjust(4)
+             h[:comments_url] = comments_url
+             h[:points] = points.to_i.to_s.rjust(4)
+             m = fulltext.scan(/\d+ \w+ ago/)
+             if !m.empty?   # scan returns an Array, so test for emptiness, not nil
+               #h[:age_text] = m.first
+               h[:age_text] = m.first.scan(/\d+ \w/).first.rjust(4)
+               h[:age] = human_age_to_unix(m.first)
+             end
+             #puts "fulltext: #{fulltext} "
+             h[:byline] = fulltext
+           end
+         end
+       end
+       #return arr
+       page[:articles] = arr
+       return page
+     end
+   end # class
+ end # module
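
Because hash_to_class above sets each article's parent back to the parser, ForumArticle#comments can lazily fetch and parse the comment page through _retrieve_comments. Continuing the sketch from the previous hunk (same assumptions):

    art = page.articles.first
    art.each_comment do |c|             # ForumComment objects
      puts c.head                       # submitter / age byline
      puts c.comment_text
    end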