reddit_get 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 10171c0f2446ba66686862c922b20a8bcd9212f15c3134e0af72652fa4e6f0dd
4
+ data.tar.gz: 61563a46f484e8dc9125febbb68c8e988774506b6037beb784c54e686c42dbed
5
+ SHA512:
6
+ metadata.gz: 602c9bdd9377205c787d6c573ac27d14781b30c1505fc5ac2beabde6a607fb27c7274745ee958567612c2413bc231c20dbd9f7b352117f4615be375baa73d17a
7
+ data.tar.gz: e32b9aeeb505c44f0d1573e0cdf33748cb0c203216a51a87e6fc006d82fd7503c0f398c9356a44914b357b8729381d57b92d78dfbb85893b54f844f4d8df19fe
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
data/.rubocop.yml ADDED
@@ -0,0 +1,6 @@
1
+ AllCops:
2
+ NewCops: enable
3
+ Exclude:
4
+ - lib/scheduler.rb
5
+
6
+ require: rubocop-performance
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ ruby '3.0.0'
6
+
7
+ gemspec
8
+
9
+ group :development do
10
+ gem 'rubocop'
11
+ gem 'rubocop-performance'
12
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,45 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ reddit_get (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.2)
10
+ parallel (1.20.1)
11
+ parser (3.0.0.0)
12
+ ast (~> 2.4.1)
13
+ rainbow (3.0.0)
14
+ regexp_parser (2.0.3)
15
+ rexml (3.2.4)
16
+ rubocop (1.9.1)
17
+ parallel (~> 1.10)
18
+ parser (>= 3.0.0.0)
19
+ rainbow (>= 2.2.2, < 4.0)
20
+ regexp_parser (>= 1.8, < 3.0)
21
+ rexml
22
+ rubocop-ast (>= 1.2.0, < 2.0)
23
+ ruby-progressbar (~> 1.7)
24
+ unicode-display_width (>= 1.4.0, < 3.0)
25
+ rubocop-ast (1.4.1)
26
+ parser (>= 2.7.1.5)
27
+ rubocop-performance (1.9.2)
28
+ rubocop (>= 0.90.0, < 2.0)
29
+ rubocop-ast (>= 0.4.0)
30
+ ruby-progressbar (1.11.0)
31
+ unicode-display_width (2.0.0)
32
+
33
+ PLATFORMS
34
+ x86_64-darwin-18
35
+
36
+ DEPENDENCIES
37
+ reddit_get!
38
+ rubocop
39
+ rubocop-performance
40
+
41
+ RUBY VERSION
42
+ ruby 3.0.0p0
43
+
44
+ BUNDLED WITH
45
+ 2.2.3
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Alessandro
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,67 @@
1
+ # RedditGet
2
+
3
+ This gem allows you to grab posts and comments from Reddit without any auth.
4
+ It concurrently grabs multiple subreddits at a time to utilize your machine as much as possible and increase throughput.
5
+
6
+ No setup and a clean interface make this gem ideal when you just want to process public reddit data.
7
+ Zero dependencies.
8
+
9
+ The [Redd gem](https://github.com/avinashbot/redd) seems to be abandoned so I created this gem to meet my needs.
10
+
11
+ ## Installation
12
+
13
+ ```ruby
14
+ gem 'reddit_get'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle install
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install reddit_get
24
+
25
+ ## Usage
26
+
27
+ ### You want to grab many subreddits
28
+ ```ruby
29
+ results = RedditGet::Subreddit.collect_all %w[gaming videos movies funny]
30
+ results # will hold RedditGet::Data which acts like a hash
31
+ results['gaming'].each do |post|
32
+ puts post['title']
33
+ end
34
+ results.gaming
35
+ ```
36
+
37
+ ### You want to grab one subreddit
38
+ ```ruby
39
+ results = RedditGet::Subreddit.collect('gaming')
40
+
41
+ results.gaming.each do |post|
42
+ puts post.title # all gaming posts titles
43
+ end
44
+ results['gaming'] # works too!
45
+ ```
46
+
47
+ ### You want to grab comments for each post
48
+ ```ruby
49
+ results = RedditGet::Subreddit.collect_all %w[gaming videos movies funny], with_comments: true
50
+ results.gaming.each do |post|
51
+ puts post.title
52
+ post.comments.each do |comment|
53
+ puts comment.body
54
+ end
55
+ end
56
+
57
+ # also works with single subreddit
58
+ RedditGet::Subreddit.collect 'gaming', with_comments: true
59
+ ```
60
+
61
+ ## Contributing
62
+
63
+ Bug reports and pull requests are welcome on GitHub at https://github.com/AlessandroMinali/reddit_get.
64
+
65
+ ## License
66
+
67
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/lib/reddit_get.rb ADDED
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'json'
5
+
6
+ require_relative 'scheduler'
7
+ require_relative 'reddit_get/version'
8
+
9
module RedditGet
  # Raised when Reddit answers a request with a non-success HTTP status.
  class Error < StandardError; end

  # Read-only view over parsed Reddit JSON that allows method-call chains
  # (`results.gaming.first.title`) in addition to Hash-style access
  # (`results['gaming']`).
  #
  # Public methods of the wrapped object (e.g. `keys`, `values`, `each`) take
  # precedence over keys of the same name; use #[] to reach such a key.
  class Data
    # @param data [Hash] parsed JSON to expose
    def initialize(data)
      @data = data
    end

    # Replaces, in place, every nested Hash (and every Hash inside a nested
    # Array) of +data+ with a Data wrapper.
    #
    # @param data [Hash] hash to transform destructively
    # @return [Hash] the same hash, mutated
    def objectify(data)
      data.transform_values! { |value| wrap(value) }
    end

    # Raw (unwrapped) lookup.
    #
    # @param key [Object] key exactly as stored in the wrapped hash
    # @raise [KeyError] when the key is absent
    def [](key)
      @data.fetch(key)
    end

    # Delegates to the wrapped object when it publicly responds to +method+,
    # forwarding arguments and block; otherwise treats the method name as a
    # String key and returns its (wrapped) value.
    #
    # Only public methods are delegated, so keys that happen to share a name
    # with a private Kernel method (`test`, `select`, `exit`, ...) are read as
    # keys rather than executed.
    #
    # @raise [KeyError] when +method+ is neither a delegated method nor a key
    def method_missing(method, *args, &block)
      return @data.public_send(method, *args, &block) if @data.respond_to?(method)

      wrap(@data.fetch(method.to_s))
    end

    # Mirrors #method_missing: true for delegated public methods and for keys.
    # Ruby always passes two arguments here, so both must be accepted.
    def respond_to_missing?(method, include_private = false)
      @data.respond_to?(method) || @data.key?(method.to_s) || super
    end

    private

    # Wraps Hashes in Data, recurses into Arrays and returns every other
    # value (String, Numeric, nil, ...) untouched.
    def wrap(value)
      case value
      when Hash then Data.new(value)
      when Array then value.map { |item| wrap(item) }
      else value
      end
    end
  end

  # Grab subreddit top page as json
  class Subreddit
    BASE_URL = 'https://old.reddit.com'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'

    # Fetches the front page of every given subreddit.
    #
    # @param subreddits [Array<String>] subreddit names; duplicates are fetched once
    # @param with_comments [Boolean] also fetch the comments of each post
    # @return [Data] maps each subreddit name to its list of posts
    # @raise [TypeError] when +subreddits+ is not an Array
    # @raise [Error] when Reddit answers with a non-success HTTP status
    def self.collect_all(subreddits, with_comments: false)
      raise TypeError, 'Must pass an array of subreddits' unless subreddits.is_a?(Array)

      names = subreddits.uniq
      results = names.to_h { |name| [name, nil] }
      run_concurrently do
        names.each { |name| grab_posts(results, name, with_comments: with_comments) }
      end
      Data.new(results)
    end

    # Single-subreddit convenience wrapper around .collect_all.
    #
    # @param subreddit [String] subreddit name
    # @return [Data] one-entry result keyed by +subreddit+
    def self.collect(subreddit, with_comments: false)
      collect_all([subreddit], with_comments: with_comments)
    end

    class << self
      private

      # Installs a scheduler, yields so the caller can start its fibers, then
      # drives every pending fiber to completion.
      #
      # The scheduler has to be installed BEFORE a fiber is first resumed:
      # a non-blocking fiber without a scheduler simply performs blocking I/O.
      # On success the scheduler is removed again (Fiber.set_scheduler(nil)
      # closes it) so it does not linger on the caller's thread.
      def run_concurrently
        scheduler = Scheduler.new
        Fiber.set_scheduler(scheduler)
        yield
        scheduler.run
        Fiber.set_scheduler(nil)
      end

      # Starts a fiber that stores the posts of +subreddit+ in +results+.
      def grab_posts(results, subreddit, with_comments:)
        Fiber.new do
          results[subreddit] = get_reddit_posts(subreddit).map! do |post|
            grab_comments(post) if with_comments
            post['data']
          end
        end.resume
      end

      # Starts a fiber that adds a 'comments' list to the post's data hash.
      def grab_comments(post)
        url = post['data']['permalink']
        Fiber.new do
          post['data']['comments'] = get_reddit_comments(url).map! do |comment|
            comment['data']
          end
        end.resume
      end

      # Building a URI here makes a malformed subreddit name fail fast with
      # URI::InvalidURIError before any request is sent.
      def get_reddit_posts(subreddit)
        get_json(URI("#{BASE_URL}/r/#{subreddit}.json")).dig('data', 'children')
      end

      # The comment endpoint returns a two-element array; the second element
      # is read for the comment listing.
      def get_reddit_comments(url)
        get_json("#{BASE_URL}#{url}.json")[1].dig('data', 'children')
      end

      # Performs the GET request and parses the JSON body.
      #
      # @param uri [URI, String] request target
      # @raise [Error] on a non-2xx response (rate limiting, unknown subreddit, ...)
      def get_json(uri)
        request = Net::HTTP::Get.new(uri, 'User-Agent' => USER_AGENT)
        response = Net::HTTP.start('old.reddit.com', 443, use_ssl: true) do |http|
          http.request(request)
        end
        raise Error, "Reddit responded with HTTP #{response.code} for #{uri}" unless response.is_a?(Net::HTTPSuccess)

        JSON.parse(response.body)
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedditGet
4
+ VERSION = '0.1.0'
5
+ end
data/lib/scheduler.rb ADDED
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This is an example and simplified scheduler for test purposes.
4
+ # It is not efficient for a large number of file descriptors as it uses IO.select().
5
+ # Production Fiber schedulers should use epoll/kqueue/etc.
6
+
7
+ require 'fiber'
8
+ require 'socket'
9
+
10
+ begin
11
+ require 'io/nonblock'
12
+ rescue LoadError
13
+ # Ignore.
14
+ end
15
+
16
module RedditGet
  # Simplified IO.select-based fiber scheduler implementing the hooks Ruby
  # calls on the object passed to Fiber.set_scheduler (io_wait, kernel_sleep,
  # block, unblock, process_wait, fiber, close).
  #
  # Bookkeeping:
  #   @readable / @writable  io    => fiber waiting for that direction
  #   @waiting               fiber => monotonic deadline
  #   @blocking              count of fibers blocked without a timeout
  #   @ready                 fibers handed over by #unblock (guarded by @lock)
  #   @urgent                self-pipe used by #unblock to wake up IO.select
  class Scheduler
    def initialize
      @readable = {}
      @writable = {}
      @waiting = {}

      @closed = false

      @lock = Mutex.new
      @blocking = 0
      @ready = []

      @urgent = IO.pipe
    end

    attr_reader :readable, :writable, :waiting

    # @return [Numeric, nil] seconds until the earliest deadline (never
    #   negative), or nil when no fiber has a deadline
    def next_timeout
      _fiber, deadline = @waiting.min_by { |_key, value| value }
      return unless deadline

      remaining = deadline - current_time
      remaining.negative? ? 0 : remaining
    end

    # Event loop: runs until no fiber is waiting on I/O, a deadline or a
    # blocking primitive.
    def run
      while @readable.any? || @writable.any? || @waiting.any? || @blocking.positive?
        # Can only handle file descriptors up to 1024...
        readable, writable = IO.select(@readable.keys + [@urgent.first], @writable.keys, [],
                                       next_timeout)

        readable&.each do |io|
          if (fiber = @readable.delete(io))
            fiber.resume
          elsif io == @urgent.first
            # Drain the wake-up bytes written by #unblock.
            @urgent.first.read_nonblock(1024)
          end
        end

        writable&.each do |io|
          fiber = @writable.delete(io)
          fiber&.resume
        end

        if @waiting.any?
          time = current_time
          # Swap the table out first: resumed fibers may register new deadlines.
          waiting = @waiting
          @waiting = {}

          waiting.each do |fiber, timeout|
            if timeout <= time
              fiber.resume
            else
              @waiting[fiber] = timeout
            end
          end
        end

        next unless @ready.any?

        ready = nil

        @lock.synchronize do
          ready = @ready
          @ready = []
        end

        ready.each(&:resume)
      end
    end

    # Runs the remaining work, then releases the wake-up pipe and freezes the
    # scheduler.
    #
    # @raise [RuntimeError] when the scheduler has already been closed
    def close
      # Checked outside the ensure block: the cleanup below must not run (and
      # fail on the already released pipe) for a repeated close.
      raise 'Scheduler already closed!' if @closed

      begin
        run
      ensure
        @urgent.each(&:close)
        @urgent = nil

        @closed = true

        # We freeze to detect any unintended modifications after the scheduler is closed:
        freeze
      end
    end

    def closed?
      @closed
    end

    def current_time
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end

    def process_wait(pid, flags)
      # This is a very simple way to implement a non-blocking wait:
      Thread.new do
        Process::Status.wait(pid, flags)
      end.value
    end

    # Suspends the current fiber until +io+ is ready for one of +events+ or
    # +duration+ seconds have elapsed.
    #
    # @param io [IO] object to wait on
    # @param events [Integer] bitmask of IO::READABLE / IO::WRITABLE
    # @param duration [Numeric, nil] timeout in seconds, nil to wait forever
    # @return [Integer, false] the subset of +events+ that became ready, or
    #   false when the wait timed out
    def io_wait(io, events, duration)
      fiber = Fiber.current
      wants_read = !(events & IO::READABLE).zero?
      wants_write = !(events & IO::WRITABLE).zero?

      @readable[io] = fiber if wants_read
      @writable[io] = fiber if wants_write
      @waiting[fiber] = current_time + duration if duration

      Fiber.yield

      # #run deletes the registration of the direction that fired before it
      # resumes us; a registration that is still ours did not fire.
      ready = 0
      ready |= IO::READABLE if wants_read && !@readable[io].equal?(fiber)
      ready |= IO::WRITABLE if wants_write && !@writable[io].equal?(fiber)
      ready.zero? ? false : ready
    ensure
      # Drop whatever is left so #run never resumes this fiber for a wait
      # that is already over.
      @waiting.delete(fiber) if duration
      @readable.delete(io) if wants_read && @readable[io].equal?(fiber)
      @writable.delete(io) if wants_write && @writable[io].equal?(fiber)
    end

    # Used for Kernel#sleep and Mutex#sleep
    def kernel_sleep(duration = nil)
      block(:sleep, duration)

      true
    end

    # Used when blocking on synchronization (Mutex#lock, Queue#pop, SizedQueue#push, ...)
    def block(_blocker, timeout = nil)
      if timeout
        @waiting[Fiber.current] = current_time + timeout
        begin
          Fiber.yield
        ensure
          # Remove from @waiting in the case #unblock was called before the timeout expired:
          @waiting.delete(Fiber.current)
        end
      else
        @blocking += 1
        begin
          Fiber.yield
        ensure
          @blocking -= 1
        end
      end
    end

    # Used when synchronization wakes up a previously-blocked fiber (Mutex#unlock, Queue#push, ...).
    # This might be called from another thread.
    def unblock(_blocker, fiber)
      @lock.synchronize do
        @ready << fiber
      end

      # Wake up the IO.select call in #run.
      io = @urgent.last
      io.write_nonblock('.')
    end

    # Creates a non-blocking fiber for +block+ and starts it immediately.
    #
    # @return [Fiber] the started fiber
    def fiber(&block)
      fiber = Fiber.new(blocking: false, &block)

      fiber.resume

      fiber
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/reddit_get/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'reddit_get'
7
+ spec.version = RedditGet::VERSION
8
+ spec.authors = ['Alessandro']
9
+ spec.email = ['4143332+AlessandroMinali@users.noreply.github.com']
10
+
11
+ spec.summary = 'Simply grab subreddit posts and their comments'
12
+ spec.description = 'A clean interface to handle reddit data without auth.'
13
+ spec.homepage = 'https://github.com/AlessandroMinali/reddit_get'
14
+ spec.license = 'MIT'
15
+ spec.required_ruby_version = Gem::Requirement.new('>= 3.0.0')
16
+
17
+ spec.metadata['homepage_uri'] = spec.homepage
18
+ spec.metadata['source_code_uri'] = 'https://github.com/AlessandroMinali/reddit_get'
19
+
20
+ # Specify which files should be added to the gem when it is released.
21
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
22
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
23
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
24
+ end
25
+ spec.bindir = 'exe'
26
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
27
+ spec.require_paths = ['lib']
28
+
29
+ spec.add_development_dependency 'rubocop'
30
+ end
data/spec.rb ADDED
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/reddit_get'
4
+
5
# Smoke test: talks to the live Reddit endpoints and raises on the first
# failed expectation.

# RedditGet#collect
expect = RedditGet::Subreddit.collect('gaming')
raise 'Not a hash' unless expect.is_a? RedditGet::Data
raise 'No results returned' unless expect.values.any?

# Each negative check uses its own flag so that a success recorded by an
# earlier check cannot mask a later one that fails to raise.
invalid_uri_raised = false
begin
  RedditGet::Subreddit.collect(['gaming'])
rescue URI::InvalidURIError
  invalid_uri_raised = true
end
raise 'Should fail' unless invalid_uri_raised

# RedditGet#collect_all
expect = RedditGet::Subreddit.collect_all(%w[gaming videos])
raise 'Not a hash' unless expect.is_a? RedditGet::Data
raise 'No results returned' unless expect.values.any?

expect = RedditGet::Subreddit.collect_all(%w[gaming videos gaming])
raise 'Must remove dups' unless expect.keys.count == 2

type_error_raised = false
begin
  RedditGet::Subreddit.collect_all('gaming')
rescue TypeError
  type_error_raised = true
end
raise 'Should fail' unless type_error_raised

expect = RedditGet::Subreddit.collect_all(%w[gaming videos], with_comments: true)
raise 'Must have comments' unless expect.gaming.all? { |i| i.comments.any? }
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reddit_get
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Alessandro
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-02-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rubocop
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: A clean interface to handle reddit data without auth.
28
+ email:
29
+ - 4143332+AlessandroMinali@users.noreply.github.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - ".gitignore"
35
+ - ".rubocop.yml"
36
+ - Gemfile
37
+ - Gemfile.lock
38
+ - LICENSE.txt
39
+ - README.md
40
+ - lib/reddit_get.rb
41
+ - lib/reddit_get/version.rb
42
+ - lib/scheduler.rb
43
+ - reddit_get.gemspec
44
+ - spec.rb
45
+ homepage: https://github.com/AlessandroMinali/reddit_get
46
+ licenses:
47
+ - MIT
48
+ metadata:
49
+ homepage_uri: https://github.com/AlessandroMinali/reddit_get
50
+ source_code_uri: https://github.com/AlessandroMinali/reddit_get
51
+ post_install_message:
52
+ rdoc_options: []
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 3.0.0
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: '0'
65
+ requirements: []
66
+ rubygems_version: 3.2.3
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Simply grab subreddit posts and their comments
70
+ test_files: []