twitterscraper-ruby 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
module Twitterscraper
  # Renders scraped tweets as a standalone HTML page whose markup is picked up
  # by Twitter's widgets.js script (loaded from platform.twitter.com).
  module Template
    module_function

    # Builds an HTML document embedding every given tweet.
    #
    # @param tweets [Enumerable<#tweet_url>] objects responding to +tweet_url+
    # @return [String] the complete HTML page
    def tweets_embedded_html(tweets)
      # The block form of String#sub inserts the replacement verbatim. The
      # string form would expand backslash sequences (e.g. "\0", "\1", "\\")
      # that happen to appear in the inserted text and corrupt the output.
      tweets_html = tweets.map do |tweet|
        EMBED_TWEET_HTML.sub('__TWEET_URL__') { tweet.tweet_url }
      end
      EMBED_TWEETS_HTML.sub('__TWEETS__') { tweets_html.join }
    end

    # Markup for a single embedded tweet; __TWEET_URL__ is the placeholder
    # replaced by the tweet's URL.
    EMBED_TWEET_HTML = <<~'HTML'
      <blockquote class="twitter-tweet">
      <a href="__TWEET_URL__"></a>
      </blockquote>
    HTML

    # Page skeleton; __TWEETS__ is the placeholder replaced by the
    # concatenated per-tweet markup.
    EMBED_TWEETS_HTML = <<~'HTML'
      <html>
      <head>
      <style type=text/css>
      .twitter-tweet {
      margin: 30px auto 0 auto !important;
      }
      </style>
      <script>
      window.twttr = (function(d, s, id) {
      var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
      if (d.getElementById(id)) return t;
      js = d.createElement(s);
      js.id = id;
      js.src = "https://platform.twitter.com/widgets.js";
      fjs.parentNode.insertBefore(js, fjs);

      t._e = [];
      t.ready = function(f) {
      t._e.push(f);
      };

      return t;
      }(document, "script", "twitter-wjs"));
      </script>
      </head>
      <body>
      __TWEETS__
      </body>
      </html>
    HTML
  end
end
@@ -0,0 +1,123 @@
1
+ require 'time'
2
+
3
module Twitterscraper
  # A single scraped tweet. Instances are built from a hash of attributes,
  # from previously exported JSON, or from Twitter's legacy timeline HTML.
  class Tweet
    # Attribute names exposed as readers and serialized by #attrs / #to_json.
    KEYS = [
      :screen_name,
      :name,
      :user_id,
      :tweet_id,
      :text,
      :links,
      :hashtags,
      :image_urls,
      :video_url,
      :has_media,
      :likes,
      :retweets,
      :replies,
      :is_replied,
      :is_reply_to,
      :parent_tweet_id,
      :reply_to_users,
      :tweet_url,
      :timestamp,
      :created_at,
    ].freeze
    attr_reader(*KEYS)

    # @param attrs [Hash] attribute name (String or Symbol) => value; every
    #   pair is stored in the instance variable of the same name
    def initialize(attrs)
      attrs.each do |key, value|
        instance_variable_set("@#{key}", value)
      end
    end

    # @return [Hash{Symbol => Object}] all KEYS mapped to their current values
    def attrs
      KEYS.map { |key| [key, send(key)] }.to_h
    end

    # Serializes the tweet as a JSON object.
    #
    # @param options [Hash, Object] generator options or state; forwarded to
    #   Hash#to_json so that callers such as JSON.pretty_generate keep their
    #   formatting for nested tweets
    # @return [String]
    def to_json(options = {})
      attrs.to_json(options)
    end

    class << self
      # Rebuilds tweets from a JSON array of tweet objects.
      #
      # @param text [String] JSON document
      # @return [Array<Tweet>] tweets with +created_at+ parsed back into a Time
      def from_json(text)
        json = JSON.parse(text)
        json.map do |tweet|
          tweet['created_at'] = Time.parse(tweet['created_at'])
          new(tweet)
        end
      end

      # Parses a timeline HTML document and extracts every tweet found in it.
      #
      # @param text [String] HTML markup
      # @return [Array<Tweet>]
      def from_html(text)
        html = Nokogiri::HTML(text)
        from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
      end

      # @param html [Enumerable] tweet container nodes
      # @return [Array<Tweet>] parsed tweets; nodes that could not be parsed
      #   (from_tweet_html returned nil) are dropped
      def from_tweets_html(html)
        html.map { |tweet| from_tweet_html(tweet) }.compact
      end

      # Builds a Tweet from one tweet container node.
      #
      # @param html [#attr, #inner_html] tweet container node
      # @return [Tweet, nil] nil (after logging a warning) when the markup has
      #   no js-tweet-text-container
      def from_tweet_html(html)
        screen_name = html.attr('data-screen-name')
        tweet_id = html.attr('data-tweet-id')&.to_i

        unless html.to_s.include?('js-tweet-text-container')
          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
          return nil
        end

        inner_html = Nokogiri::HTML(html.inner_html)
        text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
        links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
        image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
        video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }[0]
        # Always a strict boolean: a missing video_url must yield false, not nil.
        has_media = !image_urls.empty? || !video_url.to_s.empty?

        actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
        # `&.attr(...).to_i` falls back to 0 when the counter node is missing
        # (nil.to_i == 0) instead of raising NoMethodError on nil.
        likes = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--favorite')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first&.attr('data-tweet-stat-count').to_i
        retweets = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--retweet')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first&.attr('data-tweet-stat-count').to_i
        replies = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--reply u-hiddenVisually')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first&.attr('data-tweet-stat-count').to_i
        is_replied = replies != 0

        # A tweet whose conversation id equals its own id is not a reply.
        parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
        if tweet_id == parent_tweet_id
          is_reply_to = false
          parent_tweet_id = nil
          reply_to_users = []
        else
          is_reply_to = true
          reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
        end

        timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
        new(
          screen_name: screen_name,
          name: html.attr('data-name'),
          user_id: html.attr('data-user-id').to_i,
          tweet_id: tweet_id,
          text: text,
          links: links,
          hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
          image_urls: image_urls,
          video_url: video_url,
          has_media: has_media,
          likes: likes,
          retweets: retweets,
          replies: replies,
          is_replied: is_replied,
          is_reply_to: is_reply_to,
          parent_tweet_id: parent_tweet_id,
          reply_to_users: reply_to_users,
          tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
          timestamp: timestamp,
          created_at: Time.at(timestamp, in: '+00:00'),
        )
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Twitterscraper
  # Version of the gem; frozen so the shared string cannot be mutated at runtime.
  VERSION = '0.15.0'.freeze
end
@@ -0,0 +1,31 @@
1
+ require_relative 'lib/version'
2
+
3
# Gem specification for twitterscraper-ruby.
Gem::Specification.new do |spec|
  spec.name          = "twitterscraper-ruby"
  spec.version       = Twitterscraper::VERSION
  spec.authors       = ["ts-3156"]
  spec.email         = ["ts_3156@yahoo.co.jp"]

  spec.summary       = %q{A gem to scrape Tweets}
  spec.description   = %q{A gem to scrape Tweets}
  spec.homepage      = "https://github.com/ts-3156/twitterscraper-ruby"
  spec.license       = "MIT"

  # Declared exactly once. An earlier ">= 2.3.0" assignment was silently
  # overridden by this value further down, so only the effective requirement
  # is kept.
  spec.required_ruby_version = ">= 2.6.4"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match?(%r{^(test|spec|features)/}) }
  end
  spec.executables   = ["twitterscraper"]
  spec.require_paths = ["lib"]

  spec.add_dependency "nokogiri"
  spec.add_dependency "parallel"
end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twitterscraper-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.15.0
5
+ platform: ruby
6
+ authors:
7
+ - ts-3156
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-07-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: parallel
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: A gem to scrape Tweets
42
+ email:
43
+ - ts_3156@yahoo.co.jp
44
+ executables:
45
+ - twitterscraper
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".circleci/config.yml"
50
+ - ".gitignore"
51
+ - ".irbrc"
52
+ - ".rspec"
53
+ - ".ruby-version"
54
+ - ".travis.yml"
55
+ - CODE_OF_CONDUCT.md
56
+ - Gemfile
57
+ - Gemfile.lock
58
+ - LICENSE.txt
59
+ - README.md
60
+ - Rakefile
61
+ - bin/console
62
+ - bin/setup
63
+ - bin/twitterscraper
64
+ - lib/twitterscraper-ruby.rb
65
+ - lib/twitterscraper.rb
66
+ - lib/twitterscraper/cache.rb
67
+ - lib/twitterscraper/cli.rb
68
+ - lib/twitterscraper/client.rb
69
+ - lib/twitterscraper/http.rb
70
+ - lib/twitterscraper/lang.rb
71
+ - lib/twitterscraper/logger.rb
72
+ - lib/twitterscraper/proxy.rb
73
+ - lib/twitterscraper/query.rb
74
+ - lib/twitterscraper/template.rb
75
+ - lib/twitterscraper/tweet.rb
76
+ - lib/version.rb
77
+ - twitterscraper-ruby.gemspec
78
+ homepage: https://github.com/ts-3156/twitterscraper-ruby
79
+ licenses:
80
+ - MIT
81
+ metadata:
82
+ homepage_uri: https://github.com/ts-3156/twitterscraper-ruby
83
+ source_code_uri: https://github.com/ts-3156/twitterscraper-ruby
84
+ changelog_uri: https://github.com/ts-3156/twitterscraper-ruby
85
+ post_install_message:
86
+ rdoc_options: []
87
+ require_paths:
88
+ - lib
89
+ required_ruby_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: 2.6.4
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ requirements: []
100
+ rubygems_version: 3.0.3
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: A gem to scrape Tweets
104
+ test_files: []