fbcrawl-colly 0.2.2 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +0 -2
- data/Dockerfile +14 -0
- data/Gemfile.lock +8 -7
- data/README.md +1 -1
- data/fbcrawl-colly.gemspec +2 -6
- data/fbcrawl.proto +86 -3
- data/{fbcolly → fbcrawl}/fbcolly.go +186 -83
- data/fbcrawl/pb/fbcrawl.pb.go +1749 -0
- data/fbcrawl/pb/fbcrawl_grpc.pb.go +416 -0
- data/go.mod +5 -11
- data/go.sum +21 -10
- data/lib/fbcrawl-colly.rb +1 -4
- data/lib/fbcrawl_colly.rb +5 -0
- data/lib/fbcrawl_colly/client.rb +50 -0
- data/lib/fbcrawl_colly/version.rb +1 -1
- data/lib/pb/fbcrawl_pb.rb +124 -0
- data/lib/pb/fbcrawl_services_pb.rb +32 -0
- data/main.go +82 -71
- metadata +14 -26
- data/ext/fbcrawl_colly/.gitignore +0 -2
- data/ext/fbcrawl_colly/Makefile +0 -6
- data/ext/fbcrawl_colly/extconf.rb +0 -6
- data/lib/fbcrawl_colly/colly.rb +0 -50
- data/lib/fbcrawl_colly/ffi.rb +0 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eed929533973a523ed9cc5ac0cabf35559cb2e50e1c028a7ef2d8b90de15b58b
|
4
|
+
data.tar.gz: 385c7dcd852e31acdc3d591b323ba4eb2b8d979b89c8fe55a55e18b00fbbb691
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16777a1a0ae7d48b2ce7ac3bd3573a351903321a009e81d649379aa8f508986ae4511d0ed85101c8605910d383dfa9ba8b7748d759ab05c7a7076e32c5cf49cd
|
7
|
+
data.tar.gz: d902d6226148cadd62d1e0c5a45419cfdd1343831bcfd5dced56bd298982a51d1da1835b048925b347351a01e63e875e8b295e568fa803b512c57183484fd2c6
|
data/.gitignore
CHANGED
data/Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
FROM golang:1.14-alpine
|
2
|
+
RUN apk add --no-cache git build-base tzdata
|
3
|
+
|
4
|
+
RUN mkdir -p /app
|
5
|
+
WORKDIR /app
|
6
|
+
ADD ./go.mod /app
|
7
|
+
ADD ./go.sum /app
|
8
|
+
ADD ./ /app
|
9
|
+
RUN go get
|
10
|
+
|
11
|
+
|
12
|
+
ENV PORT 3000
|
13
|
+
RUN go build -o server qnetwork.net/fbcrawl
|
14
|
+
ENTRYPOINT ["./server"]
|
data/Gemfile.lock
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fbcrawl-colly (0.2.2)
|
5
|
-
ffi
|
4
|
+
fbcrawl-colly (1.0.1)
|
6
5
|
google-protobuf
|
6
|
+
grpc
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
|
12
|
-
|
11
|
+
google-protobuf (3.13.0)
|
12
|
+
googleapis-common-protos-types (1.0.5)
|
13
|
+
google-protobuf (~> 3.11)
|
14
|
+
grpc (1.30.2)
|
15
|
+
google-protobuf (~> 3.12)
|
16
|
+
googleapis-common-protos-types (~> 1.0)
|
13
17
|
minitest (5.14.1)
|
14
18
|
rake (12.3.3)
|
15
|
-
rake-compiler (1.1.1)
|
16
|
-
rake
|
17
19
|
|
18
20
|
PLATFORMS
|
19
21
|
ruby
|
@@ -22,7 +24,6 @@ DEPENDENCIES
|
|
22
24
|
fbcrawl-colly!
|
23
25
|
minitest (~> 5.0)
|
24
26
|
rake (~> 12.0)
|
25
|
-
rake-compiler
|
26
27
|
|
27
28
|
BUNDLED WITH
|
28
29
|
2.1.4
|
data/README.md
CHANGED
data/fbcrawl-colly.gemspec
CHANGED
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
end
|
26
26
|
# spec.bindir = "exe"
|
27
27
|
# spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
|
-
spec.
|
29
|
-
'ext/fbcrawl_colly/extconf.rb'
|
30
|
-
]
|
31
|
-
spec.require_paths = ["lib"]
|
28
|
+
spec.require_paths = %w[lib lib/pb]
|
32
29
|
|
33
|
-
spec.add_runtime_dependency 'ffi'
|
34
30
|
spec.add_runtime_dependency 'google-protobuf'
|
35
|
-
spec.
|
31
|
+
spec.add_runtime_dependency 'grpc'
|
36
32
|
end
|
data/fbcrawl.proto
CHANGED
@@ -1,17 +1,92 @@
|
|
1
1
|
syntax = "proto3";
|
2
2
|
|
3
3
|
package fbcrawl_colly;
|
4
|
-
option go_package = "./fbcrawl;
|
4
|
+
option go_package = "./fbcrawl/pb;pb";
|
5
|
+
|
6
|
+
service Grpc {
|
7
|
+
// Sends a greeting
|
8
|
+
rpc Init (Empty) returns (Pointer) {}
|
9
|
+
rpc FreeColly (Pointer) returns (Empty) {}
|
10
|
+
rpc Login (LoginRequest) returns (LoginResponse) {}
|
11
|
+
rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
|
12
|
+
rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
|
13
|
+
rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
|
14
|
+
rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
|
15
|
+
rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
|
16
|
+
rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
|
17
|
+
rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
|
18
|
+
}
|
19
|
+
|
20
|
+
message Empty {
|
21
|
+
|
22
|
+
}
|
23
|
+
|
24
|
+
message Pointer {
|
25
|
+
int64 address = 1;
|
26
|
+
}
|
27
|
+
|
28
|
+
message LoginRequest {
|
29
|
+
Pointer pointer = 1;
|
30
|
+
string email = 2;
|
31
|
+
string password = 3;
|
32
|
+
string totp_secret = 4;
|
33
|
+
}
|
34
|
+
|
35
|
+
message LoginResponse {
|
36
|
+
string cookies = 1;
|
37
|
+
}
|
38
|
+
|
39
|
+
message LoginWithCookiesRequest {
|
40
|
+
Pointer pointer = 1;
|
41
|
+
string cookies = 2;
|
42
|
+
}
|
43
|
+
|
44
|
+
message FetchGroupInfoRequest {
|
45
|
+
Pointer pointer = 1;
|
46
|
+
string group_username = 2;
|
47
|
+
}
|
48
|
+
|
49
|
+
message FetchUserInfoRequest {
|
50
|
+
Pointer pointer = 1;
|
51
|
+
string username = 2;
|
52
|
+
}
|
53
|
+
|
54
|
+
message FetchGroupFeedRequest {
|
55
|
+
Pointer pointer = 1;
|
56
|
+
int64 group_id = 2;
|
57
|
+
string next_cursor = 3;
|
58
|
+
}
|
59
|
+
|
60
|
+
message FetchPostRequest {
|
61
|
+
Pointer pointer = 1;
|
62
|
+
int64 group_id = 2;
|
63
|
+
int64 post_id = 3;
|
64
|
+
string comment_next_cursor = 4;
|
65
|
+
}
|
66
|
+
|
67
|
+
message FetchContentImagesRequest {
|
68
|
+
Pointer pointer = 1;
|
69
|
+
int64 post_id = 2;
|
70
|
+
string next_cursor = 3;
|
71
|
+
}
|
72
|
+
|
73
|
+
message FetchImageUrlRequest {
|
74
|
+
Pointer pointer = 1;
|
75
|
+
int64 image_id = 2;
|
76
|
+
}
|
5
77
|
|
6
78
|
// The request message containing the user's name.
|
7
79
|
message FacebookGroup {
|
8
80
|
int64 id = 1;
|
9
81
|
string name = 2;
|
82
|
+
int64 member_count = 3;
|
10
83
|
}
|
11
84
|
|
12
85
|
message FacebookUser {
|
13
86
|
int64 id = 1;
|
14
87
|
string name = 2;
|
88
|
+
string username = 3;
|
89
|
+
int64 friend_count =4;
|
15
90
|
}
|
16
91
|
|
17
92
|
message FacebookPost {
|
@@ -19,12 +94,18 @@ message FacebookPost {
|
|
19
94
|
FacebookGroup group = 2;
|
20
95
|
FacebookUser user = 3;
|
21
96
|
string content = 4;
|
97
|
+
CommentList comments = 5;
|
22
98
|
string content_link = 6;
|
23
|
-
FacebookImage content_image = 8;
|
24
99
|
repeated FacebookImage content_images = 7;
|
25
|
-
|
100
|
+
FacebookImage content_image = 8;
|
26
101
|
int64 created_at = 9;
|
27
102
|
int64 reaction_count = 10;
|
103
|
+
int64 comment_count = 11;
|
104
|
+
}
|
105
|
+
|
106
|
+
message CommentList {
|
107
|
+
repeated FacebookComment comments = 5;
|
108
|
+
string next_cursor = 12;
|
28
109
|
}
|
29
110
|
|
30
111
|
message FacebookImage {
|
@@ -42,8 +123,10 @@ message FacebookComment {
|
|
42
123
|
|
43
124
|
message FacebookPostList {
|
44
125
|
repeated FacebookPost posts = 1;
|
126
|
+
string next_cursor = 2;
|
45
127
|
}
|
46
128
|
|
47
129
|
message FacebookImageList {
|
48
130
|
repeated FacebookImage images = 1;
|
131
|
+
string next_cursor = 2;
|
49
132
|
}
|
@@ -6,16 +6,18 @@ import (
|
|
6
6
|
"errors"
|
7
7
|
"fmt"
|
8
8
|
"github.com/PuerkitoBio/goquery"
|
9
|
-
"github.com/gocolly/colly"
|
10
|
-
"github.com/gocolly/colly/
|
11
|
-
"github.com/gocolly/colly/
|
9
|
+
"github.com/gocolly/colly/v2"
|
10
|
+
"github.com/gocolly/colly/v2/debug"
|
11
|
+
"github.com/gocolly/colly/v2/extensions"
|
12
|
+
"github.com/gocolly/colly/v2/storage"
|
12
13
|
"github.com/google/logger"
|
13
14
|
"github.com/olebedev/when"
|
14
15
|
"github.com/olebedev/when/rules/common"
|
15
16
|
"github.com/olebedev/when/rules/en"
|
16
17
|
"github.com/thoas/go-funk"
|
18
|
+
"github.com/xlzd/gotp"
|
17
19
|
"net/url"
|
18
|
-
"qnetwork.net/fbcrawl/fbcrawl"
|
20
|
+
"qnetwork.net/fbcrawl/fbcrawl/pb"
|
19
21
|
"regexp"
|
20
22
|
"strconv"
|
21
23
|
"strings"
|
@@ -33,7 +35,7 @@ type FbDataInsight struct {
|
|
33
35
|
FbDataPostContext `json:"post_context"`
|
34
36
|
}
|
35
37
|
type FbDataFt struct {
|
36
|
-
ContentOwnerIdNew
|
38
|
+
ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
|
37
39
|
PhotoAttachmentsList []string `json:"photo_attachments_list"`
|
38
40
|
PhotoId int64 `json:"photo_id,string"`
|
39
41
|
PageId int64 `json:"page_id,string"`
|
@@ -42,7 +44,7 @@ type FbDataFt struct {
|
|
42
44
|
}
|
43
45
|
|
44
46
|
func sharedOnRequest(request *colly.Request) {
|
45
|
-
logger.Info("OnRequest")
|
47
|
+
logger.Info("OnRequest ", request.URL)
|
46
48
|
//request.Headers.Set("Host", "facebook.com")
|
47
49
|
request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
|
48
50
|
request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
@@ -65,6 +67,7 @@ func setupSharedCollector(collector *colly.Collector) error {
|
|
65
67
|
collector.AllowURLRevisit = true
|
66
68
|
collector.OnRequest(sharedOnRequest)
|
67
69
|
collector.OnResponse(sharedOnResponse)
|
70
|
+
collector.SetDebugger(&debug.LogDebugger{})
|
68
71
|
collector.OnError(func(resp *colly.Response, errHttp error) {
|
69
72
|
err = errHttp
|
70
73
|
logger.Error("OnError", err)
|
@@ -106,26 +109,29 @@ func New() *Fbcolly {
|
|
106
109
|
return &f
|
107
110
|
}
|
108
111
|
|
109
|
-
func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
|
112
|
+
func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
|
110
113
|
collector := f.collector.Clone()
|
111
114
|
err := setupSharedCollector(collector)
|
112
115
|
|
113
116
|
logger.Info("Login using email", email)
|
114
117
|
loggedIn := false
|
115
|
-
|
118
|
+
firstLogin := true
|
116
119
|
collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
120
|
+
if firstLogin {
|
121
|
+
firstLogin = false
|
122
|
+
logger.Info("OnHTML login_form")
|
123
|
+
loginURL, err, reqMap := getForm(element, err)
|
124
|
+
if err != nil {
|
125
|
+
logger.Error(err)
|
126
|
+
return
|
127
|
+
}
|
128
|
+
reqMap["email"] = email
|
129
|
+
reqMap["pass"] = password
|
130
|
+
logger.Info("req map:", reqMap)
|
131
|
+
err = collector.Post(loginURL, reqMap)
|
132
|
+
if err != nil {
|
133
|
+
logger.Error("post err:", err)
|
134
|
+
}
|
129
135
|
}
|
130
136
|
})
|
131
137
|
|
@@ -152,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
152
158
|
//logger.Info("Please input OTP")
|
153
159
|
//reader := bufio.NewReader(os.Stdin)
|
154
160
|
//code, _ := reader.ReadString('\n')
|
155
|
-
|
156
|
-
|
157
|
-
|
161
|
+
if len(totpSecret) > 0 {
|
162
|
+
code := gotp.NewDefaultTOTP(totpSecret).Now()
|
163
|
+
reqMap["approvals_code"] = code
|
164
|
+
shouldSubmit = true
|
165
|
+
}
|
166
|
+
|
158
167
|
} else {
|
159
168
|
logger.Info("OnHTML Only Continue checkpoint")
|
160
169
|
|
@@ -188,68 +197,152 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
188
197
|
|
189
198
|
}
|
190
199
|
|
191
|
-
func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
|
200
|
+
func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
|
192
201
|
collector := f.collector.Clone()
|
193
202
|
err := setupSharedCollector(collector)
|
194
|
-
|
195
|
-
var result []*fbcrawl.FacebookPost
|
203
|
+
result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
|
196
204
|
|
197
205
|
collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
206
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
207
|
+
})
|
208
|
+
collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
|
209
|
+
dataElement := element
|
210
|
+
post := &pb.FacebookPost{}
|
211
|
+
var fbDataFt FbDataFt
|
212
|
+
jsonData := dataElement.Attr("data-ft")
|
213
|
+
|
214
|
+
logger.Info(jsonData)
|
215
|
+
err = json.Unmarshal([]byte(jsonData), &fbDataFt)
|
216
|
+
if err != nil {
|
217
|
+
logger.Error(err)
|
218
|
+
return
|
219
|
+
}
|
220
|
+
logger.Info("Post ", fbDataFt)
|
221
|
+
post.Id = fbDataFt.TopLevelPostId
|
222
|
+
post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
|
223
|
+
userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
|
224
|
+
post.User = &pb.FacebookUser{
|
225
|
+
Id: userId,
|
226
|
+
Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
|
202
227
|
}
|
228
|
+
post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
|
229
|
+
//Content
|
230
|
+
|
231
|
+
//NO BACKGROUND TEXT ONLY
|
232
|
+
post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
233
|
+
return selection.Text()
|
234
|
+
}), "\n")
|
235
|
+
|
236
|
+
if len(post.Content) == 0 {
|
237
|
+
// TEXT WITH BACKGROUND
|
238
|
+
post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
|
239
|
+
}
|
240
|
+
|
241
|
+
post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
242
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
|
243
|
+
post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
|
244
|
+
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
245
|
+
i, _ := strconv.ParseInt(id, 10, 64)
|
246
|
+
return &pb.FacebookImage{
|
247
|
+
Id: i,
|
248
|
+
}
|
249
|
+
})).([]*pb.FacebookImage)
|
250
|
+
|
251
|
+
if fbDataFt.PhotoId > 0 {
|
252
|
+
post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
|
253
|
+
}
|
254
|
+
result.Posts = append(result.Posts, post)
|
203
255
|
})
|
256
|
+
if len(nextCursor) > 0 {
|
257
|
+
err = collector.Visit(nextCursor)
|
258
|
+
} else {
|
259
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
260
|
+
}
|
204
261
|
|
205
|
-
|
206
|
-
logger.
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
262
|
+
if err != nil {
|
263
|
+
logger.Error("crawl by colly err:", err)
|
264
|
+
}
|
265
|
+
return err, &result
|
266
|
+
}
|
267
|
+
|
268
|
+
func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
|
269
|
+
collector := f.collector.Clone()
|
270
|
+
err := setupSharedCollector(collector)
|
271
|
+
|
272
|
+
result := &pb.FacebookUser{}
|
273
|
+
|
274
|
+
collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
|
275
|
+
parsed, _ := url.Parse(element.Attr("href"))
|
276
|
+
result.Username = strings.Split(parsed.Path[1:], "/")[0]
|
277
|
+
result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
|
278
|
+
})
|
279
|
+
|
280
|
+
collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
|
281
|
+
result.FriendCount = getNumberFromText(element.Text)
|
282
|
+
})
|
283
|
+
|
284
|
+
collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
|
285
|
+
result.Name = element.DOM.Find("strong").First().Text()
|
286
|
+
})
|
287
|
+
|
288
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
|
289
|
+
if err != nil {
|
290
|
+
logger.Error("crawl by colly err:", err)
|
291
|
+
}
|
292
|
+
return err, result
|
293
|
+
}
|
294
|
+
|
295
|
+
func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
|
296
|
+
collector := f.collector.Clone()
|
297
|
+
err := setupSharedCollector(collector)
|
298
|
+
result := &pb.FacebookGroup{}
|
299
|
+
|
300
|
+
collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
|
301
|
+
result.Name = element.Text
|
302
|
+
})
|
303
|
+
collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
|
304
|
+
result.Id = getNumberFromText(element.Attr("href"))
|
305
|
+
result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
|
214
306
|
})
|
215
307
|
|
216
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%
|
308
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
|
217
309
|
if err != nil {
|
218
310
|
logger.Error("crawl by colly err:", err)
|
219
311
|
}
|
220
|
-
return err,
|
312
|
+
return err, result
|
221
313
|
}
|
222
314
|
|
223
|
-
func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
|
315
|
+
func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
|
224
316
|
collector := f.collector.Clone()
|
225
317
|
err := setupSharedCollector(collector)
|
226
|
-
|
227
|
-
var result []*fbcrawl.FacebookImage
|
318
|
+
result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
|
228
319
|
|
229
320
|
collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
|
230
|
-
|
231
|
-
logger.Info("Will fetch page", currentPage)
|
232
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
321
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
233
322
|
})
|
234
323
|
|
235
324
|
collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
|
236
|
-
result = append(result, &fbcrawl.FacebookImage{
|
325
|
+
result.Images = append(result.Images, &pb.FacebookImage{
|
237
326
|
Id: getImageIdFromHref(element.Attr("href")),
|
238
327
|
})
|
239
328
|
//f.detailCollector.Visit(url)
|
240
329
|
})
|
330
|
+
if len(nextCursor) > 0 {
|
331
|
+
err = collector.Visit(nextCursor)
|
332
|
+
} else {
|
333
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
334
|
+
}
|
241
335
|
|
242
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
243
336
|
if err != nil {
|
244
337
|
logger.Error("crawl by colly err:", err)
|
245
338
|
}
|
246
|
-
return err, &
|
339
|
+
return err, &result
|
247
340
|
}
|
248
341
|
|
249
|
-
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
342
|
+
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
|
250
343
|
collector := f.collector.Clone()
|
251
344
|
err := setupSharedCollector(collector)
|
252
|
-
result :=
|
345
|
+
result := pb.FacebookImage{Id: imageId}
|
253
346
|
|
254
347
|
collector.OnHTML("a", func(element *colly.HTMLElement) {
|
255
348
|
result.Url = element.Attr("href")
|
@@ -262,11 +355,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
|
262
355
|
return err, &result
|
263
356
|
}
|
264
357
|
|
265
|
-
func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
|
358
|
+
func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
|
266
359
|
collector := f.collector.Clone()
|
267
360
|
err := setupSharedCollector(collector)
|
268
|
-
post := &
|
269
|
-
commentPaging := 0
|
361
|
+
post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
|
270
362
|
collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
|
271
363
|
dataElement := element.DOM.Find("div[data-ft]")
|
272
364
|
if dataElement.Length() > 0 {
|
@@ -281,9 +373,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
281
373
|
}
|
282
374
|
logger.Info("Post ", result)
|
283
375
|
post.Id = result.TopLevelPostId
|
284
|
-
post.Group = &
|
285
|
-
|
286
|
-
|
376
|
+
post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
|
377
|
+
userId, _ := result.ContentOwnerIdNew.Int64()
|
378
|
+
post.User = &pb.FacebookUser{
|
379
|
+
Id: userId,
|
287
380
|
Name: dataElement.Find("h3 strong:first-child a").Text(),
|
288
381
|
}
|
289
382
|
post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
|
@@ -300,21 +393,17 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
300
393
|
}
|
301
394
|
|
302
395
|
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
303
|
-
post.ReactionCount =
|
304
|
-
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
|
396
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
|
397
|
+
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
305
398
|
i, _ := strconv.ParseInt(id, 10, 64)
|
306
|
-
return &fbcrawl.FacebookImage{
|
399
|
+
return &pb.FacebookImage{
|
307
400
|
Id: i,
|
308
401
|
}
|
309
|
-
})).([]*fbcrawl.FacebookImage)
|
402
|
+
})).([]*pb.FacebookImage)
|
310
403
|
|
311
404
|
if result.PhotoId > 0 {
|
312
|
-
post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
|
405
|
+
post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
|
313
406
|
}
|
314
|
-
|
315
|
-
logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
316
|
-
return selection.Text()
|
317
|
-
}), "\n"))
|
318
407
|
}
|
319
408
|
|
320
409
|
//Comment
|
@@ -323,12 +412,13 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
323
412
|
commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
|
324
413
|
logger.Info("comment", commentId)
|
325
414
|
createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
|
326
|
-
|
415
|
+
parsed, _ := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
|
416
|
+
post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
|
327
417
|
Id: commentId,
|
328
|
-
Post: &
|
329
|
-
User: &
|
330
|
-
|
331
|
-
Name:
|
418
|
+
Post: &pb.FacebookPost{Id: post.Id},
|
419
|
+
User: &pb.FacebookUser{
|
420
|
+
Username: parsed.Path[1:],
|
421
|
+
Name: selection.Find("h3 > a").Text(),
|
332
422
|
},
|
333
423
|
Content: selection.Find("h3 + div").Text(),
|
334
424
|
CreatedAt: createdAtWhenResult.Time.Unix(),
|
@@ -339,14 +429,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
339
429
|
})
|
340
430
|
|
341
431
|
collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
|
342
|
-
|
343
|
-
logger.Info("Comment paging", commentPaging)
|
344
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
345
|
-
commentPaging = commentPaging + 1
|
346
|
-
}
|
432
|
+
post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
347
433
|
})
|
434
|
+
if len(commentNextCursor) > 0 {
|
435
|
+
err = collector.Visit(commentNextCursor)
|
436
|
+
} else {
|
437
|
+
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
438
|
+
}
|
348
439
|
|
349
|
-
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
350
440
|
return err, post
|
351
441
|
}
|
352
442
|
|
@@ -360,8 +450,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
|
|
360
450
|
//}
|
361
451
|
|
362
452
|
func getUserIdFromCommentHref(href string) int64 {
|
363
|
-
|
364
|
-
|
453
|
+
match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
|
454
|
+
if len(match) > 0 {
|
455
|
+
id, _ := strconv.ParseInt(match[1], 10, 64)
|
456
|
+
return id
|
457
|
+
}
|
458
|
+
return 0
|
365
459
|
}
|
366
460
|
|
367
461
|
func getUrlFromRedirectHref(href string) string {
|
@@ -375,11 +469,20 @@ func getImageIdFromHref(href string) int64 {
|
|
375
469
|
return i
|
376
470
|
}
|
377
471
|
|
378
|
-
func
|
472
|
+
func getNumberFromText(text string) int64 {
|
379
473
|
logger.Error("reaction", text)
|
380
474
|
if len(text) > 0 {
|
381
|
-
|
382
|
-
|
475
|
+
match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
|
476
|
+
if len(match) > 0 {
|
477
|
+
count, _ := strconv.ParseInt(match[1], 10, 64)
|
478
|
+
switch match[2] {
|
479
|
+
case "k":
|
480
|
+
count *= 1000
|
481
|
+
case "m":
|
482
|
+
count *= 1000000
|
483
|
+
}
|
484
|
+
return count
|
485
|
+
}
|
383
486
|
}
|
384
487
|
return 0
|
385
488
|
}
|