fbcrawl-colly 0.2.3 → 1.1.0
This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -2
- data/Dockerfile +14 -0
- data/Gemfile.lock +8 -7
- data/README.md +1 -1
- data/fbcrawl-colly.gemspec +2 -6
- data/fbcrawl.proto +80 -3
- data/{fbcolly → fbcrawl}/fbcolly.go +154 -90
- data/fbcrawl/pb/fbcrawl.pb.go +1706 -0
- data/fbcrawl/pb/fbcrawl_grpc.pb.go +308 -0
- data/go.mod +4 -11
- data/go.sum +18 -10
- data/lib/fbcrawl-colly.rb +1 -4
- data/lib/fbcrawl_colly.rb +5 -0
- data/lib/fbcrawl_colly/client.rb +50 -0
- data/lib/fbcrawl_colly/version.rb +1 -1
- data/lib/pb/fbcrawl_pb.rb +122 -0
- data/lib/pb/fbcrawl_services_pb.rb +29 -0
- data/main.go +66 -74
- metadata +14 -26
- data/ext/fbcrawl_colly/.gitignore +0 -2
- data/ext/fbcrawl_colly/Makefile +0 -6
- data/ext/fbcrawl_colly/extconf.rb +0 -6
- data/lib/fbcrawl_colly/colly.rb +0 -50
- data/lib/fbcrawl_colly/ffi.rb +0 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 138016b11ed36541b96961dbfad0627b5d8bccd26a7638f95e5a8806d0fadf31
|
4
|
+
data.tar.gz: 376c58bc102d6ece54631f0151889d7546af436e9f40f64d994ac44b7110e29e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8580cf0cf980989aab899d4f1185a33dff13743cfedea5487e59f8a990cd23d8c9c4751e18c2192b396c27c4ef5d5e95919404522fc4399b9754c1c0753f63a
|
7
|
+
data.tar.gz: 84b182c8dcfd4a3b5aa7297c6cd3bca485a4f372d65579f03220c32bcf2fcda1d95be02cff4caf5c0bb88a997daa53684b6f25fc8c088be4e7e31406c4f1f4ba
|
data/.gitignore
CHANGED
data/Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
FROM golang:1.14-alpine
|
2
|
+
RUN apk add --no-cache git build-base tzdata
|
3
|
+
|
4
|
+
RUN mkdir -p /app
|
5
|
+
WORKDIR /app
|
6
|
+
ADD ./go.mod /app
|
7
|
+
ADD ./go.sum /app
|
8
|
+
ADD ./ /app
|
9
|
+
RUN go get
|
10
|
+
|
11
|
+
|
12
|
+
ENV PORT 3000
|
13
|
+
RUN go build -o server qnetwork.net/fbcrawl
|
14
|
+
ENTRYPOINT ["./server"]
|
data/Gemfile.lock
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fbcrawl-colly (
|
5
|
-
ffi
|
4
|
+
fbcrawl-colly (1.1.0)
|
6
5
|
google-protobuf
|
6
|
+
grpc
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
|
12
|
-
|
11
|
+
google-protobuf (3.13.0)
|
12
|
+
googleapis-common-protos-types (1.0.5)
|
13
|
+
google-protobuf (~> 3.11)
|
14
|
+
grpc (1.31.1)
|
15
|
+
google-protobuf (~> 3.12)
|
16
|
+
googleapis-common-protos-types (~> 1.0)
|
13
17
|
minitest (5.14.1)
|
14
18
|
rake (12.3.3)
|
15
|
-
rake-compiler (1.1.1)
|
16
|
-
rake
|
17
19
|
|
18
20
|
PLATFORMS
|
19
21
|
ruby
|
@@ -22,7 +24,6 @@ DEPENDENCIES
|
|
22
24
|
fbcrawl-colly!
|
23
25
|
minitest (~> 5.0)
|
24
26
|
rake (~> 12.0)
|
25
|
-
rake-compiler
|
26
27
|
|
27
28
|
BUNDLED WITH
|
28
29
|
2.1.4
|
data/README.md
CHANGED
data/fbcrawl-colly.gemspec
CHANGED
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
end
|
26
26
|
# spec.bindir = "exe"
|
27
27
|
# spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
|
-
spec.
|
29
|
-
'ext/fbcrawl_colly/extconf.rb'
|
30
|
-
]
|
31
|
-
spec.require_paths = ["lib"]
|
28
|
+
spec.require_paths = %w[lib lib/pb]
|
32
29
|
|
33
|
-
spec.add_runtime_dependency 'ffi'
|
34
30
|
spec.add_runtime_dependency 'google-protobuf'
|
35
|
-
spec.
|
31
|
+
spec.add_runtime_dependency 'grpc'
|
36
32
|
end
|
data/fbcrawl.proto
CHANGED
@@ -1,17 +1,87 @@
|
|
1
1
|
syntax = "proto3";
|
2
2
|
|
3
3
|
package fbcrawl_colly;
|
4
|
-
option go_package = "./fbcrawl;
|
4
|
+
option go_package = "./fbcrawl/pb;pb";
|
5
|
+
|
6
|
+
service Grpc {
|
7
|
+
// Sends a greeting
|
8
|
+
rpc Login (LoginRequest) returns (LoginResponse) {}
|
9
|
+
rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
|
10
|
+
rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
|
11
|
+
rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
|
12
|
+
rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
|
13
|
+
rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
|
14
|
+
rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
|
15
|
+
}
|
16
|
+
|
17
|
+
message Empty {
|
18
|
+
|
19
|
+
}
|
20
|
+
|
21
|
+
message Context {
|
22
|
+
string cookies = 1;
|
23
|
+
}
|
24
|
+
|
25
|
+
message LoginRequest {
|
26
|
+
string email = 2;
|
27
|
+
string password = 3;
|
28
|
+
string totp_secret = 4;
|
29
|
+
}
|
30
|
+
|
31
|
+
message LoginResponse {
|
32
|
+
string cookies = 1;
|
33
|
+
}
|
34
|
+
|
35
|
+
message LoginWithCookiesRequest {
|
36
|
+
string cookies = 1;
|
37
|
+
}
|
38
|
+
|
39
|
+
message FetchGroupInfoRequest {
|
40
|
+
Context context = 1;
|
41
|
+
string group_username = 2;
|
42
|
+
}
|
43
|
+
|
44
|
+
message FetchUserInfoRequest {
|
45
|
+
Context context = 1;
|
46
|
+
string username = 2;
|
47
|
+
}
|
48
|
+
|
49
|
+
message FetchGroupFeedRequest {
|
50
|
+
Context context = 1;
|
51
|
+
int64 group_id = 2;
|
52
|
+
string next_cursor = 3;
|
53
|
+
}
|
54
|
+
|
55
|
+
message FetchPostRequest {
|
56
|
+
Context context = 1;
|
57
|
+
int64 group_id = 2;
|
58
|
+
int64 post_id = 3;
|
59
|
+
string comment_next_cursor = 4;
|
60
|
+
}
|
61
|
+
|
62
|
+
message FetchContentImagesRequest {
|
63
|
+
Context context = 1;
|
64
|
+
int64 post_id = 2;
|
65
|
+
string next_cursor = 3;
|
66
|
+
}
|
67
|
+
|
68
|
+
message FetchImageUrlRequest {
|
69
|
+
Context context = 1;
|
70
|
+
int64 image_id = 2;
|
71
|
+
}
|
5
72
|
|
6
73
|
// The request message containing the user's name.
|
7
74
|
message FacebookGroup {
|
8
75
|
int64 id = 1;
|
9
76
|
string name = 2;
|
77
|
+
int64 member_count = 3;
|
10
78
|
}
|
11
79
|
|
12
80
|
message FacebookUser {
|
13
81
|
int64 id = 1;
|
14
82
|
string name = 2;
|
83
|
+
string username = 3;
|
84
|
+
int64 friend_count = 4;
|
15
85
|
}
|
16
86
|
|
17
87
|
message FacebookPost {
|
@@ -19,15 +89,20 @@ message FacebookPost {
|
|
19
89
|
FacebookGroup group = 2;
|
20
90
|
FacebookUser user = 3;
|
21
91
|
string content = 4;
|
92
|
+
CommentList comments = 5;
|
22
93
|
string content_link = 6;
|
23
|
-
FacebookImage content_image = 8;
|
24
94
|
repeated FacebookImage content_images = 7;
|
25
|
-
|
95
|
+
FacebookImage content_image = 8;
|
26
96
|
int64 created_at = 9;
|
27
97
|
int64 reaction_count = 10;
|
28
98
|
int64 comment_count = 11;
|
29
99
|
}
|
30
100
|
|
101
|
+
message CommentList {
|
102
|
+
repeated FacebookComment comments = 5;
|
103
|
+
string next_cursor = 12;
|
104
|
+
}
|
105
|
+
|
31
106
|
message FacebookImage {
|
32
107
|
int64 id = 1;
|
33
108
|
string url = 2;
|
@@ -43,8 +118,10 @@ message FacebookComment {
|
|
43
118
|
|
44
119
|
message FacebookPostList {
|
45
120
|
repeated FacebookPost posts = 1;
|
121
|
+
string next_cursor = 2;
|
46
122
|
}
|
47
123
|
|
48
124
|
message FacebookImageList {
|
49
125
|
repeated FacebookImage images = 1;
|
126
|
+
string next_cursor = 2;
|
50
127
|
}
|
@@ -6,16 +6,17 @@ import (
|
|
6
6
|
"errors"
|
7
7
|
"fmt"
|
8
8
|
"github.com/PuerkitoBio/goquery"
|
9
|
-
"github.com/gocolly/colly"
|
10
|
-
"github.com/gocolly/colly/extensions"
|
11
|
-
"github.com/gocolly/colly/storage"
|
9
|
+
"github.com/gocolly/colly/v2"
|
10
|
+
"github.com/gocolly/colly/v2/extensions"
|
11
|
+
"github.com/gocolly/colly/v2/storage"
|
12
12
|
"github.com/google/logger"
|
13
13
|
"github.com/olebedev/when"
|
14
14
|
"github.com/olebedev/when/rules/common"
|
15
15
|
"github.com/olebedev/when/rules/en"
|
16
16
|
"github.com/thoas/go-funk"
|
17
|
+
"github.com/xlzd/gotp"
|
17
18
|
"net/url"
|
18
|
-
"qnetwork.net/fbcrawl/fbcrawl"
|
19
|
+
"qnetwork.net/fbcrawl/fbcrawl/pb"
|
19
20
|
"regexp"
|
20
21
|
"strconv"
|
21
22
|
"strings"
|
@@ -33,7 +34,7 @@ type FbDataInsight struct {
|
|
33
34
|
FbDataPostContext `json:"post_context"`
|
34
35
|
}
|
35
36
|
type FbDataFt struct {
|
36
|
-
ContentOwnerIdNew
|
37
|
+
ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
|
37
38
|
PhotoAttachmentsList []string `json:"photo_attachments_list"`
|
38
39
|
PhotoId int64 `json:"photo_id,string"`
|
39
40
|
PageId int64 `json:"page_id,string"`
|
@@ -42,7 +43,7 @@ type FbDataFt struct {
|
|
42
43
|
}
|
43
44
|
|
44
45
|
func sharedOnRequest(request *colly.Request) {
|
45
|
-
logger.Info("OnRequest")
|
46
|
+
logger.Info("OnRequest ", request.URL)
|
46
47
|
//request.Headers.Set("Host", "facebook.com")
|
47
48
|
request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
|
48
49
|
request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
@@ -106,26 +107,29 @@ func New() *Fbcolly {
|
|
106
107
|
return &f
|
107
108
|
}
|
108
109
|
|
109
|
-
func (f *Fbcolly) Login(email string, password string,
|
110
|
+
func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
|
110
111
|
collector := f.collector.Clone()
|
111
112
|
err := setupSharedCollector(collector)
|
112
113
|
|
113
114
|
logger.Info("Login using email", email)
|
114
115
|
loggedIn := false
|
115
|
-
|
116
|
+
firstLogin := true
|
116
117
|
collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
118
|
+
if firstLogin {
|
119
|
+
firstLogin = false
|
120
|
+
logger.Info("OnHTML login_form")
|
121
|
+
loginURL, err, reqMap := getForm(element, err)
|
122
|
+
if err != nil {
|
123
|
+
logger.Error(err)
|
124
|
+
return
|
125
|
+
}
|
126
|
+
reqMap["email"] = email
|
127
|
+
reqMap["pass"] = password
|
128
|
+
logger.Info("req map:", reqMap)
|
129
|
+
err = collector.Post(loginURL, reqMap)
|
130
|
+
if err != nil {
|
131
|
+
logger.Error("post err:", err)
|
132
|
+
}
|
129
133
|
}
|
130
134
|
})
|
131
135
|
|
@@ -152,9 +156,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
152
156
|
//logger.Info("Please input OTP")
|
153
157
|
//reader := bufio.NewReader(os.Stdin)
|
154
158
|
//code, _ := reader.ReadString('\n')
|
155
|
-
|
156
|
-
|
157
|
-
|
159
|
+
if len(totpSecret) > 0 {
|
160
|
+
code := gotp.NewDefaultTOTP(totpSecret).Now()
|
161
|
+
reqMap["approvals_code"] = code
|
162
|
+
shouldSubmit = true
|
163
|
+
}
|
164
|
+
|
158
165
|
} else {
|
159
166
|
logger.Info("OnHTML Only Continue checkpoint")
|
160
167
|
|
@@ -188,22 +195,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
188
195
|
|
189
196
|
}
|
190
197
|
|
191
|
-
func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *
|
198
|
+
func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
|
192
199
|
collector := f.collector.Clone()
|
193
200
|
err := setupSharedCollector(collector)
|
194
|
-
|
195
|
-
var result []*fbcrawl.FacebookPost
|
201
|
+
result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
|
196
202
|
|
197
203
|
collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
|
198
|
-
|
199
|
-
if currentPage < 3 {
|
200
|
-
logger.Info("Will fetch page", currentPage)
|
201
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
202
|
-
}
|
204
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
203
205
|
})
|
204
|
-
collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
|
206
|
+
collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
|
205
207
|
dataElement := element
|
206
|
-
post := &
|
208
|
+
post := &pb.FacebookPost{}
|
207
209
|
var fbDataFt FbDataFt
|
208
210
|
jsonData := dataElement.Attr("data-ft")
|
209
211
|
|
@@ -215,9 +217,10 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
|
|
215
217
|
}
|
216
218
|
logger.Info("Post ", fbDataFt)
|
217
219
|
post.Id = fbDataFt.TopLevelPostId
|
218
|
-
post.Group = &
|
219
|
-
|
220
|
-
|
220
|
+
post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
|
221
|
+
userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
|
222
|
+
post.User = &pb.FacebookUser{
|
223
|
+
Id: userId,
|
221
224
|
Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
|
222
225
|
}
|
223
226
|
post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
|
@@ -236,56 +239,108 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
|
|
236
239
|
post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
237
240
|
post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
|
238
241
|
post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
|
239
|
-
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *
|
242
|
+
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
240
243
|
i, _ := strconv.ParseInt(id, 10, 64)
|
241
|
-
return &
|
244
|
+
return &pb.FacebookImage{
|
242
245
|
Id: i,
|
243
246
|
}
|
244
|
-
})).([]*
|
247
|
+
})).([]*pb.FacebookImage)
|
245
248
|
|
246
249
|
if fbDataFt.PhotoId > 0 {
|
247
|
-
post.ContentImage = &
|
250
|
+
post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
|
248
251
|
}
|
249
|
-
result = append(result, post)
|
252
|
+
result.Posts = append(result.Posts, post)
|
253
|
+
})
|
254
|
+
if len(nextCursor) > 0 {
|
255
|
+
err = collector.Visit(nextCursor)
|
256
|
+
} else {
|
257
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
258
|
+
}
|
259
|
+
|
260
|
+
if err != nil {
|
261
|
+
logger.Error("crawl by colly err:", err)
|
262
|
+
}
|
263
|
+
return err, &result
|
264
|
+
}
|
265
|
+
|
266
|
+
func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
|
267
|
+
collector := f.collector.Clone()
|
268
|
+
err := setupSharedCollector(collector)
|
269
|
+
|
270
|
+
result := &pb.FacebookUser{}
|
271
|
+
|
272
|
+
collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
|
273
|
+
parsed, _ := url.Parse(element.Attr("href"))
|
274
|
+
result.Username = strings.Split(parsed.Path[1:], "/")[0]
|
275
|
+
result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
|
276
|
+
})
|
277
|
+
|
278
|
+
collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
|
279
|
+
result.FriendCount = getNumberFromText(element.Text)
|
250
280
|
})
|
251
281
|
|
252
|
-
|
282
|
+
collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
|
283
|
+
result.Name = element.DOM.Find("strong").First().Text()
|
284
|
+
})
|
285
|
+
|
286
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
|
253
287
|
if err != nil {
|
254
288
|
logger.Error("crawl by colly err:", err)
|
255
289
|
}
|
256
|
-
return err,
|
290
|
+
return err, result
|
257
291
|
}
|
258
292
|
|
259
|
-
func (f *Fbcolly)
|
293
|
+
func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
|
260
294
|
collector := f.collector.Clone()
|
261
295
|
err := setupSharedCollector(collector)
|
262
|
-
|
263
|
-
|
296
|
+
result := &pb.FacebookGroup{}
|
297
|
+
|
298
|
+
collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
|
299
|
+
result.Name = element.Text
|
300
|
+
})
|
301
|
+
collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
|
302
|
+
result.Id = getNumberFromText(element.Attr("href"))
|
303
|
+
result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
|
304
|
+
})
|
305
|
+
|
306
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
|
307
|
+
if err != nil {
|
308
|
+
logger.Error("crawl by colly err:", err)
|
309
|
+
}
|
310
|
+
return err, result
|
311
|
+
}
|
312
|
+
|
313
|
+
func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
|
314
|
+
collector := f.collector.Clone()
|
315
|
+
err := setupSharedCollector(collector)
|
316
|
+
result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
|
264
317
|
|
265
318
|
collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
|
266
|
-
|
267
|
-
logger.Info("Will fetch page", currentPage)
|
268
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
319
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
269
320
|
})
|
270
321
|
|
271
322
|
collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
|
272
|
-
result = append(result, &
|
323
|
+
result.Images = append(result.Images, &pb.FacebookImage{
|
273
324
|
Id: getImageIdFromHref(element.Attr("href")),
|
274
325
|
})
|
275
326
|
//f.detailCollector.Visit(url)
|
276
327
|
})
|
328
|
+
if len(nextCursor) > 0 {
|
329
|
+
err = collector.Visit(nextCursor)
|
330
|
+
} else {
|
331
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
332
|
+
}
|
277
333
|
|
278
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
279
334
|
if err != nil {
|
280
335
|
logger.Error("crawl by colly err:", err)
|
281
336
|
}
|
282
|
-
return err, &
|
337
|
+
return err, &result
|
283
338
|
}
|
284
339
|
|
285
|
-
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *
|
340
|
+
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
|
286
341
|
collector := f.collector.Clone()
|
287
342
|
err := setupSharedCollector(collector)
|
288
|
-
result :=
|
343
|
+
result := pb.FacebookImage{Id: imageId}
|
289
344
|
|
290
345
|
collector.OnHTML("a", func(element *colly.HTMLElement) {
|
291
346
|
result.Url = element.Attr("href")
|
@@ -298,11 +353,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
|
298
353
|
return err, &result
|
299
354
|
}
|
300
355
|
|
301
|
-
func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *
|
356
|
+
func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
|
302
357
|
collector := f.collector.Clone()
|
303
358
|
err := setupSharedCollector(collector)
|
304
|
-
post := &
|
305
|
-
commentPaging := 0
|
359
|
+
post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
|
306
360
|
collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
|
307
361
|
dataElement := element.DOM.Find("div[data-ft]")
|
308
362
|
if dataElement.Length() > 0 {
|
@@ -317,9 +371,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
317
371
|
}
|
318
372
|
logger.Info("Post ", result)
|
319
373
|
post.Id = result.TopLevelPostId
|
320
|
-
post.Group = &
|
321
|
-
|
322
|
-
|
374
|
+
post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
|
375
|
+
userId, _ := result.ContentOwnerIdNew.Int64()
|
376
|
+
post.User = &pb.FacebookUser{
|
377
|
+
Id: userId,
|
323
378
|
Name: dataElement.Find("h3 strong:first-child a").Text(),
|
324
379
|
}
|
325
380
|
post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
|
@@ -337,15 +392,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
337
392
|
|
338
393
|
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
339
394
|
post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
|
340
|
-
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *
|
395
|
+
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
341
396
|
i, _ := strconv.ParseInt(id, 10, 64)
|
342
|
-
return &
|
397
|
+
return &pb.FacebookImage{
|
343
398
|
Id: i,
|
344
399
|
}
|
345
|
-
})).([]*
|
400
|
+
})).([]*pb.FacebookImage)
|
346
401
|
|
347
402
|
if result.PhotoId > 0 {
|
348
|
-
post.ContentImage = &
|
403
|
+
post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
|
349
404
|
}
|
350
405
|
}
|
351
406
|
|
@@ -355,30 +410,33 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
355
410
|
commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
|
356
411
|
logger.Info("comment", commentId)
|
357
412
|
createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
Id:
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
413
|
+
parsed, _ := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
|
414
|
+
if len(parsed.Path) > 1 {
|
415
|
+
post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
|
416
|
+
Id: commentId,
|
417
|
+
Post: &pb.FacebookPost{Id: post.Id},
|
418
|
+
User: &pb.FacebookUser{
|
419
|
+
Username: parsed.Path[1:],
|
420
|
+
Name: selection.Find("h3 > a").Text(),
|
421
|
+
},
|
422
|
+
Content: selection.Find("h3 + div").Text(),
|
423
|
+
CreatedAt: createdAtWhenResult.Time.Unix(),
|
424
|
+
})
|
425
|
+
}
|
368
426
|
})
|
369
427
|
|
370
428
|
}
|
371
429
|
})
|
372
430
|
|
373
431
|
collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
|
374
|
-
|
375
|
-
logger.Info("Comment paging", commentPaging)
|
376
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
377
|
-
commentPaging = commentPaging + 1
|
378
|
-
}
|
432
|
+
post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
379
433
|
})
|
434
|
+
if len(commentNextCursor) > 0 {
|
435
|
+
err = collector.Visit(commentNextCursor)
|
436
|
+
} else {
|
437
|
+
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
438
|
+
}
|
380
439
|
|
381
|
-
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
382
440
|
return err, post
|
383
441
|
}
|
384
442
|
|
@@ -392,8 +450,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
|
|
392
450
|
//}
|
393
451
|
|
394
452
|
func getUserIdFromCommentHref(href string) int64 {
|
395
|
-
|
396
|
-
|
453
|
+
match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
|
454
|
+
if len(match) > 0 {
|
455
|
+
id, _ := strconv.ParseInt(match[1], 10, 64)
|
456
|
+
return id
|
457
|
+
}
|
458
|
+
return 0
|
397
459
|
}
|
398
460
|
|
399
461
|
func getUrlFromRedirectHref(href string) string {
|
@@ -410,15 +472,17 @@ func getImageIdFromHref(href string) int64 {
|
|
410
472
|
func getNumberFromText(text string) int64 {
|
411
473
|
logger.Error("reaction", text)
|
412
474
|
if len(text) > 0 {
|
413
|
-
match := regexp.MustCompile("(\\d
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
475
|
+
match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
|
476
|
+
if len(match) > 0 {
|
477
|
+
count, _ := strconv.ParseInt(match[1], 10, 64)
|
478
|
+
switch match[2] {
|
479
|
+
case "k":
|
480
|
+
count *= 1000
|
481
|
+
case "m":
|
482
|
+
count *= 1000000
|
483
|
+
}
|
484
|
+
return count
|
420
485
|
}
|
421
|
-
return count
|
422
486
|
}
|
423
487
|
return 0
|
424
488
|
}
|