twitterscraper-ruby 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +3 -0
- data/lib/twitterscraper/tweet.rb +59 -4
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5a0a2d55fac0a72e83d696c088daa6ca84b7b13519fbbe7a259dd1979373039a
|
4
|
+
data.tar.gz: a6cf2a0793f05d03d8d9b489eba985a244c7dce9f70e935d03207a7e103d6365
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b4ca939b22a48fc53e1c1cb9ea25f55cdd6f8a53eb26fa1733948a8df44cd46fa51884668a70bbc31e85c4b986172d23995633557644b5ea93d7640b4034cf9
|
7
|
+
data.tar.gz: 9b1d61933990c916734fc6722bc12e6fdda513c4532edcb86982feabc30dabeaa13f39db03c8555fb8ddaa2aafc0493cb88069fbc374515737ed1465522f153b
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -2,7 +2,27 @@ require 'time'
|
|
2
2
|
|
3
3
|
module Twitterscraper
|
4
4
|
class Tweet
|
5
|
-
KEYS = [
|
5
|
+
KEYS = [
|
6
|
+
:screen_name,
|
7
|
+
:name,
|
8
|
+
:user_id,
|
9
|
+
:tweet_id,
|
10
|
+
:text,
|
11
|
+
:links,
|
12
|
+
:hashtags,
|
13
|
+
:image_urls,
|
14
|
+
:video_url,
|
15
|
+
:has_media,
|
16
|
+
:likes,
|
17
|
+
:retweets,
|
18
|
+
:replies,
|
19
|
+
:is_replied,
|
20
|
+
:is_reply_to,
|
21
|
+
:parent_tweet_id,
|
22
|
+
:reply_to_users,
|
23
|
+
:tweet_url,
|
24
|
+
:created_at,
|
25
|
+
]
|
6
26
|
attr_reader *KEYS
|
7
27
|
|
8
28
|
def initialize(attrs)
|
@@ -31,15 +51,50 @@ module Twitterscraper
|
|
31
51
|
|
32
52
|
def from_tweet_html(html)
|
33
53
|
inner_html = Nokogiri::HTML(html.inner_html)
|
34
|
-
|
54
|
+
tweet_id = html.attr('data-tweet-id').to_i
|
55
|
+
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
56
|
+
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
57
|
+
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
58
|
+
video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }[0]
|
59
|
+
has_media = !image_urls.empty? || (video_url && !video_url.empty?)
|
60
|
+
|
61
|
+
actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
|
62
|
+
likes = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--favorite')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
|
63
|
+
retweets = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--retweet')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
|
64
|
+
replies = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--reply u-hiddenVisually')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
|
65
|
+
is_replied = replies != 0
|
66
|
+
|
67
|
+
parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
|
68
|
+
if tweet_id == parent_tweet_id
|
69
|
+
is_reply_to = false
|
70
|
+
parent_tweet_id = nil
|
71
|
+
reply_to_users = []
|
72
|
+
else
|
73
|
+
is_reply_to = true
|
74
|
+
reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
|
75
|
+
end
|
76
|
+
|
77
|
+
timestamp = inner_html.xpath("//span[@class[contains(., 'ProfileTweet-action--favorite')]]").first.attr('data-time').to_i
|
35
78
|
new(
|
36
79
|
screen_name: html.attr('data-screen-name'),
|
37
80
|
name: html.attr('data-name'),
|
38
81
|
user_id: html.attr('data-user-id').to_i,
|
39
|
-
tweet_id:
|
82
|
+
tweet_id: tweet_id,
|
83
|
+
text: text,
|
84
|
+
links: links,
|
85
|
+
hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
|
86
|
+
image_urls: image_urls,
|
87
|
+
video_url: video_url,
|
88
|
+
has_media: has_media,
|
89
|
+
likes: likes,
|
90
|
+
retweets: retweets,
|
91
|
+
replies: replies,
|
92
|
+
is_replied: is_replied,
|
93
|
+
is_reply_to: is_reply_to,
|
94
|
+
parent_tweet_id: parent_tweet_id,
|
95
|
+
reply_to_users: reply_to_users,
|
40
96
|
tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
|
41
97
|
created_at: Time.at(timestamp, in: '+00:00'),
|
42
|
-
text: inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text,
|
43
98
|
)
|
44
99
|
end
|
45
100
|
end
|
data/lib/version.rb
CHANGED