fbcrawl-colly 0.2.2 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -2
- data/Dockerfile +14 -0
- data/Gemfile.lock +8 -7
- data/README.md +1 -1
- data/fbcrawl-colly.gemspec +2 -6
- data/fbcrawl.proto +86 -3
- data/{fbcolly → fbcrawl}/fbcolly.go +186 -83
- data/fbcrawl/pb/fbcrawl.pb.go +1749 -0
- data/fbcrawl/pb/fbcrawl_grpc.pb.go +416 -0
- data/go.mod +5 -11
- data/go.sum +21 -10
- data/lib/fbcrawl-colly.rb +1 -4
- data/lib/fbcrawl_colly.rb +5 -0
- data/lib/fbcrawl_colly/client.rb +50 -0
- data/lib/fbcrawl_colly/version.rb +1 -1
- data/lib/pb/fbcrawl_pb.rb +124 -0
- data/lib/pb/fbcrawl_services_pb.rb +32 -0
- data/main.go +82 -71
- metadata +14 -26
- data/ext/fbcrawl_colly/.gitignore +0 -2
- data/ext/fbcrawl_colly/Makefile +0 -6
- data/ext/fbcrawl_colly/extconf.rb +0 -6
- data/lib/fbcrawl_colly/colly.rb +0 -50
- data/lib/fbcrawl_colly/ffi.rb +0 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eed929533973a523ed9cc5ac0cabf35559cb2e50e1c028a7ef2d8b90de15b58b
|
4
|
+
data.tar.gz: 385c7dcd852e31acdc3d591b323ba4eb2b8d979b89c8fe55a55e18b00fbbb691
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16777a1a0ae7d48b2ce7ac3bd3573a351903321a009e81d649379aa8f508986ae4511d0ed85101c8605910d383dfa9ba8b7748d759ab05c7a7076e32c5cf49cd
|
7
|
+
data.tar.gz: d902d6226148cadd62d1e0c5a45419cfdd1343831bcfd5dced56bd298982a51d1da1835b048925b347351a01e63e875e8b295e568fa803b512c57183484fd2c6
|
data/.gitignore
CHANGED
data/Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
FROM golang:1.14-alpine
|
2
|
+
RUN apk add --no-cache git build-base tzdata
|
3
|
+
|
4
|
+
RUN mkdir -p /app
|
5
|
+
WORKDIR /app
|
6
|
+
ADD ./go.mod /app
|
7
|
+
ADD ./go.sum /app
|
8
|
+
ADD ./ /app
|
9
|
+
RUN go get
|
10
|
+
|
11
|
+
|
12
|
+
ENV PORT 3000
|
13
|
+
RUN go build -o server qnetwork.net/fbcrawl
|
14
|
+
ENTRYPOINT ["./server"]
|
data/Gemfile.lock
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fbcrawl-colly (0.
|
5
|
-
ffi
|
4
|
+
fbcrawl-colly (1.0.1)
|
6
5
|
google-protobuf
|
6
|
+
grpc
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
|
12
|
-
|
11
|
+
google-protobuf (3.13.0)
|
12
|
+
googleapis-common-protos-types (1.0.5)
|
13
|
+
google-protobuf (~> 3.11)
|
14
|
+
grpc (1.30.2)
|
15
|
+
google-protobuf (~> 3.12)
|
16
|
+
googleapis-common-protos-types (~> 1.0)
|
13
17
|
minitest (5.14.1)
|
14
18
|
rake (12.3.3)
|
15
|
-
rake-compiler (1.1.1)
|
16
|
-
rake
|
17
19
|
|
18
20
|
PLATFORMS
|
19
21
|
ruby
|
@@ -22,7 +24,6 @@ DEPENDENCIES
|
|
22
24
|
fbcrawl-colly!
|
23
25
|
minitest (~> 5.0)
|
24
26
|
rake (~> 12.0)
|
25
|
-
rake-compiler
|
26
27
|
|
27
28
|
BUNDLED WITH
|
28
29
|
2.1.4
|
data/README.md
CHANGED
data/fbcrawl-colly.gemspec
CHANGED
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
end
|
26
26
|
# spec.bindir = "exe"
|
27
27
|
# spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
|
-
spec.
|
29
|
-
'ext/fbcrawl_colly/extconf.rb'
|
30
|
-
]
|
31
|
-
spec.require_paths = ["lib"]
|
28
|
+
spec.require_paths = %w[lib lib/pb]
|
32
29
|
|
33
|
-
spec.add_runtime_dependency 'ffi'
|
34
30
|
spec.add_runtime_dependency 'google-protobuf'
|
35
|
-
spec.
|
31
|
+
spec.add_runtime_dependency 'grpc'
|
36
32
|
end
|
data/fbcrawl.proto
CHANGED
@@ -1,17 +1,92 @@
|
|
1
1
|
syntax = "proto3";
|
2
2
|
|
3
3
|
package fbcrawl_colly;
|
4
|
-
option go_package = "./fbcrawl;
|
4
|
+
option go_package = "./fbcrawl/pb;pb";
|
5
|
+
|
6
|
+
service Grpc {
|
7
|
+
// Sends a greeting
|
8
|
+
rpc Init (Empty) returns (Pointer) {}
|
9
|
+
rpc FreeColly (Pointer) returns (Empty) {}
|
10
|
+
rpc Login (LoginRequest) returns (LoginResponse) {}
|
11
|
+
rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
|
12
|
+
rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
|
13
|
+
rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
|
14
|
+
rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
|
15
|
+
rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
|
16
|
+
rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
|
17
|
+
rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
|
18
|
+
}
|
19
|
+
|
20
|
+
message Empty {
|
21
|
+
|
22
|
+
}
|
23
|
+
|
24
|
+
message Pointer {
|
25
|
+
int64 address = 1;
|
26
|
+
}
|
27
|
+
|
28
|
+
message LoginRequest {
|
29
|
+
Pointer pointer = 1;
|
30
|
+
string email = 2;
|
31
|
+
string password = 3;
|
32
|
+
string totp_secret = 4;
|
33
|
+
}
|
34
|
+
|
35
|
+
message LoginResponse {
|
36
|
+
string cookies = 1;
|
37
|
+
}
|
38
|
+
|
39
|
+
message LoginWithCookiesRequest {
|
40
|
+
Pointer pointer = 1;
|
41
|
+
string cookies = 2;
|
42
|
+
}
|
43
|
+
|
44
|
+
message FetchGroupInfoRequest {
|
45
|
+
Pointer pointer = 1;
|
46
|
+
string group_username = 2;
|
47
|
+
}
|
48
|
+
|
49
|
+
message FetchUserInfoRequest {
|
50
|
+
Pointer pointer = 1;
|
51
|
+
string username = 2;
|
52
|
+
}
|
53
|
+
|
54
|
+
message FetchGroupFeedRequest {
|
55
|
+
Pointer pointer = 1;
|
56
|
+
int64 group_id = 2;
|
57
|
+
string next_cursor = 3;
|
58
|
+
}
|
59
|
+
|
60
|
+
message FetchPostRequest {
|
61
|
+
Pointer pointer = 1;
|
62
|
+
int64 group_id = 2;
|
63
|
+
int64 post_id = 3;
|
64
|
+
string comment_next_cursor = 4;
|
65
|
+
}
|
66
|
+
|
67
|
+
message FetchContentImagesRequest {
|
68
|
+
Pointer pointer = 1;
|
69
|
+
int64 post_id = 2;
|
70
|
+
string next_cursor = 3;
|
71
|
+
}
|
72
|
+
|
73
|
+
message FetchImageUrlRequest {
|
74
|
+
Pointer pointer = 1;
|
75
|
+
int64 image_id = 2;
|
76
|
+
}
|
5
77
|
|
6
78
|
// The request message containing the user's name.
|
7
79
|
message FacebookGroup {
|
8
80
|
int64 id = 1;
|
9
81
|
string name = 2;
|
82
|
+
int64 member_count = 3;
|
10
83
|
}
|
11
84
|
|
12
85
|
message FacebookUser {
|
13
86
|
int64 id = 1;
|
14
87
|
string name = 2;
|
88
|
+
string username = 3;
|
89
|
+
int64 friend_count =4;
|
15
90
|
}
|
16
91
|
|
17
92
|
message FacebookPost {
|
@@ -19,12 +94,18 @@ message FacebookPost {
|
|
19
94
|
FacebookGroup group = 2;
|
20
95
|
FacebookUser user = 3;
|
21
96
|
string content = 4;
|
97
|
+
CommentList comments = 5;
|
22
98
|
string content_link = 6;
|
23
|
-
FacebookImage content_image = 8;
|
24
99
|
repeated FacebookImage content_images = 7;
|
25
|
-
|
100
|
+
FacebookImage content_image = 8;
|
26
101
|
int64 created_at = 9;
|
27
102
|
int64 reaction_count = 10;
|
103
|
+
int64 comment_count = 11;
|
104
|
+
}
|
105
|
+
|
106
|
+
message CommentList {
|
107
|
+
repeated FacebookComment comments = 5;
|
108
|
+
string next_cursor = 12;
|
28
109
|
}
|
29
110
|
|
30
111
|
message FacebookImage {
|
@@ -42,8 +123,10 @@ message FacebookComment {
|
|
42
123
|
|
43
124
|
message FacebookPostList {
|
44
125
|
repeated FacebookPost posts = 1;
|
126
|
+
string next_cursor = 2;
|
45
127
|
}
|
46
128
|
|
47
129
|
message FacebookImageList {
|
48
130
|
repeated FacebookImage images = 1;
|
131
|
+
string next_cursor = 2;
|
49
132
|
}
|
@@ -6,16 +6,18 @@ import (
|
|
6
6
|
"errors"
|
7
7
|
"fmt"
|
8
8
|
"github.com/PuerkitoBio/goquery"
|
9
|
-
"github.com/gocolly/colly"
|
10
|
-
"github.com/gocolly/colly/
|
11
|
-
"github.com/gocolly/colly/
|
9
|
+
"github.com/gocolly/colly/v2"
|
10
|
+
"github.com/gocolly/colly/v2/debug"
|
11
|
+
"github.com/gocolly/colly/v2/extensions"
|
12
|
+
"github.com/gocolly/colly/v2/storage"
|
12
13
|
"github.com/google/logger"
|
13
14
|
"github.com/olebedev/when"
|
14
15
|
"github.com/olebedev/when/rules/common"
|
15
16
|
"github.com/olebedev/when/rules/en"
|
16
17
|
"github.com/thoas/go-funk"
|
18
|
+
"github.com/xlzd/gotp"
|
17
19
|
"net/url"
|
18
|
-
"qnetwork.net/fbcrawl/fbcrawl"
|
20
|
+
"qnetwork.net/fbcrawl/fbcrawl/pb"
|
19
21
|
"regexp"
|
20
22
|
"strconv"
|
21
23
|
"strings"
|
@@ -33,7 +35,7 @@ type FbDataInsight struct {
|
|
33
35
|
FbDataPostContext `json:"post_context"`
|
34
36
|
}
|
35
37
|
type FbDataFt struct {
|
36
|
-
ContentOwnerIdNew
|
38
|
+
ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
|
37
39
|
PhotoAttachmentsList []string `json:"photo_attachments_list"`
|
38
40
|
PhotoId int64 `json:"photo_id,string"`
|
39
41
|
PageId int64 `json:"page_id,string"`
|
@@ -42,7 +44,7 @@ type FbDataFt struct {
|
|
42
44
|
}
|
43
45
|
|
44
46
|
func sharedOnRequest(request *colly.Request) {
|
45
|
-
logger.Info("OnRequest")
|
47
|
+
logger.Info("OnRequest ", request.URL)
|
46
48
|
//request.Headers.Set("Host", "facebook.com")
|
47
49
|
request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
|
48
50
|
request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
@@ -65,6 +67,7 @@ func setupSharedCollector(collector *colly.Collector) error {
|
|
65
67
|
collector.AllowURLRevisit = true
|
66
68
|
collector.OnRequest(sharedOnRequest)
|
67
69
|
collector.OnResponse(sharedOnResponse)
|
70
|
+
collector.SetDebugger(&debug.LogDebugger{})
|
68
71
|
collector.OnError(func(resp *colly.Response, errHttp error) {
|
69
72
|
err = errHttp
|
70
73
|
logger.Error("OnError", err)
|
@@ -106,26 +109,29 @@ func New() *Fbcolly {
|
|
106
109
|
return &f
|
107
110
|
}
|
108
111
|
|
109
|
-
func (f *Fbcolly) Login(email string, password string,
|
112
|
+
func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
|
110
113
|
collector := f.collector.Clone()
|
111
114
|
err := setupSharedCollector(collector)
|
112
115
|
|
113
116
|
logger.Info("Login using email", email)
|
114
117
|
loggedIn := false
|
115
|
-
|
118
|
+
firstLogin := true
|
116
119
|
collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
120
|
+
if firstLogin {
|
121
|
+
firstLogin = false
|
122
|
+
logger.Info("OnHTML login_form")
|
123
|
+
loginURL, err, reqMap := getForm(element, err)
|
124
|
+
if err != nil {
|
125
|
+
logger.Error(err)
|
126
|
+
return
|
127
|
+
}
|
128
|
+
reqMap["email"] = email
|
129
|
+
reqMap["pass"] = password
|
130
|
+
logger.Info("req map:", reqMap)
|
131
|
+
err = collector.Post(loginURL, reqMap)
|
132
|
+
if err != nil {
|
133
|
+
logger.Error("post err:", err)
|
134
|
+
}
|
129
135
|
}
|
130
136
|
})
|
131
137
|
|
@@ -152,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
152
158
|
//logger.Info("Please input OTP")
|
153
159
|
//reader := bufio.NewReader(os.Stdin)
|
154
160
|
//code, _ := reader.ReadString('\n')
|
155
|
-
|
156
|
-
|
157
|
-
|
161
|
+
if len(totpSecret) > 0 {
|
162
|
+
code := gotp.NewDefaultTOTP(totpSecret).Now()
|
163
|
+
reqMap["approvals_code"] = code
|
164
|
+
shouldSubmit = true
|
165
|
+
}
|
166
|
+
|
158
167
|
} else {
|
159
168
|
logger.Info("OnHTML Only Continue checkpoint")
|
160
169
|
|
@@ -188,68 +197,152 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
188
197
|
|
189
198
|
}
|
190
199
|
|
191
|
-
func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *
|
200
|
+
func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
|
192
201
|
collector := f.collector.Clone()
|
193
202
|
err := setupSharedCollector(collector)
|
194
|
-
|
195
|
-
var result []*fbcrawl.FacebookPost
|
203
|
+
result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
|
196
204
|
|
197
205
|
collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
206
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
207
|
+
})
|
208
|
+
collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
|
209
|
+
dataElement := element
|
210
|
+
post := &pb.FacebookPost{}
|
211
|
+
var fbDataFt FbDataFt
|
212
|
+
jsonData := dataElement.Attr("data-ft")
|
213
|
+
|
214
|
+
logger.Info(jsonData)
|
215
|
+
err = json.Unmarshal([]byte(jsonData), &fbDataFt)
|
216
|
+
if err != nil {
|
217
|
+
logger.Error(err)
|
218
|
+
return
|
219
|
+
}
|
220
|
+
logger.Info("Post ", fbDataFt)
|
221
|
+
post.Id = fbDataFt.TopLevelPostId
|
222
|
+
post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
|
223
|
+
userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
|
224
|
+
post.User = &pb.FacebookUser{
|
225
|
+
Id: userId,
|
226
|
+
Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
|
202
227
|
}
|
228
|
+
post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
|
229
|
+
//Content
|
230
|
+
|
231
|
+
//NO BACKGROUND TEXT ONLY
|
232
|
+
post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
233
|
+
return selection.Text()
|
234
|
+
}), "\n")
|
235
|
+
|
236
|
+
if len(post.Content) == 0 {
|
237
|
+
// TEXT WITH BACKGROUND
|
238
|
+
post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
|
239
|
+
}
|
240
|
+
|
241
|
+
post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
242
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
|
243
|
+
post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
|
244
|
+
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
245
|
+
i, _ := strconv.ParseInt(id, 10, 64)
|
246
|
+
return &pb.FacebookImage{
|
247
|
+
Id: i,
|
248
|
+
}
|
249
|
+
})).([]*pb.FacebookImage)
|
250
|
+
|
251
|
+
if fbDataFt.PhotoId > 0 {
|
252
|
+
post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
|
253
|
+
}
|
254
|
+
result.Posts = append(result.Posts, post)
|
203
255
|
})
|
256
|
+
if len(nextCursor) > 0 {
|
257
|
+
err = collector.Visit(nextCursor)
|
258
|
+
} else {
|
259
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
260
|
+
}
|
204
261
|
|
205
|
-
|
206
|
-
logger.
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
262
|
+
if err != nil {
|
263
|
+
logger.Error("crawl by colly err:", err)
|
264
|
+
}
|
265
|
+
return err, &result
|
266
|
+
}
|
267
|
+
|
268
|
+
func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
|
269
|
+
collector := f.collector.Clone()
|
270
|
+
err := setupSharedCollector(collector)
|
271
|
+
|
272
|
+
result := &pb.FacebookUser{}
|
273
|
+
|
274
|
+
collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
|
275
|
+
parsed, _ := url.Parse(element.Attr("href"))
|
276
|
+
result.Username = strings.Split(parsed.Path[1:], "/")[0]
|
277
|
+
result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
|
278
|
+
})
|
279
|
+
|
280
|
+
collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
|
281
|
+
result.FriendCount = getNumberFromText(element.Text)
|
282
|
+
})
|
283
|
+
|
284
|
+
collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
|
285
|
+
result.Name = element.DOM.Find("strong").First().Text()
|
286
|
+
})
|
287
|
+
|
288
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
|
289
|
+
if err != nil {
|
290
|
+
logger.Error("crawl by colly err:", err)
|
291
|
+
}
|
292
|
+
return err, result
|
293
|
+
}
|
294
|
+
|
295
|
+
func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
|
296
|
+
collector := f.collector.Clone()
|
297
|
+
err := setupSharedCollector(collector)
|
298
|
+
result := &pb.FacebookGroup{}
|
299
|
+
|
300
|
+
collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
|
301
|
+
result.Name = element.Text
|
302
|
+
})
|
303
|
+
collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
|
304
|
+
result.Id = getNumberFromText(element.Attr("href"))
|
305
|
+
result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
|
214
306
|
})
|
215
307
|
|
216
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%
|
308
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
|
217
309
|
if err != nil {
|
218
310
|
logger.Error("crawl by colly err:", err)
|
219
311
|
}
|
220
|
-
return err,
|
312
|
+
return err, result
|
221
313
|
}
|
222
314
|
|
223
|
-
func (f *Fbcolly) FetchContentImages(postId int64) (error, *
|
315
|
+
func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
|
224
316
|
collector := f.collector.Clone()
|
225
317
|
err := setupSharedCollector(collector)
|
226
|
-
|
227
|
-
var result []*fbcrawl.FacebookImage
|
318
|
+
result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
|
228
319
|
|
229
320
|
collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
|
230
|
-
|
231
|
-
logger.Info("Will fetch page", currentPage)
|
232
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
321
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
233
322
|
})
|
234
323
|
|
235
324
|
collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
|
236
|
-
result = append(result, &
|
325
|
+
result.Images = append(result.Images, &pb.FacebookImage{
|
237
326
|
Id: getImageIdFromHref(element.Attr("href")),
|
238
327
|
})
|
239
328
|
//f.detailCollector.Visit(url)
|
240
329
|
})
|
330
|
+
if len(nextCursor) > 0 {
|
331
|
+
err = collector.Visit(nextCursor)
|
332
|
+
} else {
|
333
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
334
|
+
}
|
241
335
|
|
242
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
243
336
|
if err != nil {
|
244
337
|
logger.Error("crawl by colly err:", err)
|
245
338
|
}
|
246
|
-
return err, &
|
339
|
+
return err, &result
|
247
340
|
}
|
248
341
|
|
249
|
-
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *
|
342
|
+
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
|
250
343
|
collector := f.collector.Clone()
|
251
344
|
err := setupSharedCollector(collector)
|
252
|
-
result :=
|
345
|
+
result := pb.FacebookImage{Id: imageId}
|
253
346
|
|
254
347
|
collector.OnHTML("a", func(element *colly.HTMLElement) {
|
255
348
|
result.Url = element.Attr("href")
|
@@ -262,11 +355,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
|
262
355
|
return err, &result
|
263
356
|
}
|
264
357
|
|
265
|
-
func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *
|
358
|
+
func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
|
266
359
|
collector := f.collector.Clone()
|
267
360
|
err := setupSharedCollector(collector)
|
268
|
-
post := &
|
269
|
-
commentPaging := 0
|
361
|
+
post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
|
270
362
|
collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
|
271
363
|
dataElement := element.DOM.Find("div[data-ft]")
|
272
364
|
if dataElement.Length() > 0 {
|
@@ -281,9 +373,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
281
373
|
}
|
282
374
|
logger.Info("Post ", result)
|
283
375
|
post.Id = result.TopLevelPostId
|
284
|
-
post.Group = &
|
285
|
-
|
286
|
-
|
376
|
+
post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
|
377
|
+
userId, _ := result.ContentOwnerIdNew.Int64()
|
378
|
+
post.User = &pb.FacebookUser{
|
379
|
+
Id: userId,
|
287
380
|
Name: dataElement.Find("h3 strong:first-child a").Text(),
|
288
381
|
}
|
289
382
|
post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
|
@@ -300,21 +393,17 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
300
393
|
}
|
301
394
|
|
302
395
|
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
303
|
-
post.ReactionCount =
|
304
|
-
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *
|
396
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
|
397
|
+
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
305
398
|
i, _ := strconv.ParseInt(id, 10, 64)
|
306
|
-
return &
|
399
|
+
return &pb.FacebookImage{
|
307
400
|
Id: i,
|
308
401
|
}
|
309
|
-
})).([]*
|
402
|
+
})).([]*pb.FacebookImage)
|
310
403
|
|
311
404
|
if result.PhotoId > 0 {
|
312
|
-
post.ContentImage = &
|
405
|
+
post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
|
313
406
|
}
|
314
|
-
|
315
|
-
logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
316
|
-
return selection.Text()
|
317
|
-
}), "\n"))
|
318
407
|
}
|
319
408
|
|
320
409
|
//Comment
|
@@ -323,12 +412,13 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
323
412
|
commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
|
324
413
|
logger.Info("comment", commentId)
|
325
414
|
createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
|
326
|
-
|
415
|
+
parsed, _ := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
|
416
|
+
post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
|
327
417
|
Id: commentId,
|
328
|
-
Post: &
|
329
|
-
User: &
|
330
|
-
|
331
|
-
Name:
|
418
|
+
Post: &pb.FacebookPost{Id: post.Id},
|
419
|
+
User: &pb.FacebookUser{
|
420
|
+
Username: parsed.Path[1:],
|
421
|
+
Name: selection.Find("h3 > a").Text(),
|
332
422
|
},
|
333
423
|
Content: selection.Find("h3 + div").Text(),
|
334
424
|
CreatedAt: createdAtWhenResult.Time.Unix(),
|
@@ -339,14 +429,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
339
429
|
})
|
340
430
|
|
341
431
|
collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
|
342
|
-
|
343
|
-
logger.Info("Comment paging", commentPaging)
|
344
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
345
|
-
commentPaging = commentPaging + 1
|
346
|
-
}
|
432
|
+
post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
347
433
|
})
|
434
|
+
if len(commentNextCursor) > 0 {
|
435
|
+
err = collector.Visit(commentNextCursor)
|
436
|
+
} else {
|
437
|
+
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
438
|
+
}
|
348
439
|
|
349
|
-
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
350
440
|
return err, post
|
351
441
|
}
|
352
442
|
|
@@ -360,8 +450,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
|
|
360
450
|
//}
|
361
451
|
|
362
452
|
func getUserIdFromCommentHref(href string) int64 {
|
363
|
-
|
364
|
-
|
453
|
+
match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
|
454
|
+
if len(match) > 0 {
|
455
|
+
id, _ := strconv.ParseInt(match[1], 10, 64)
|
456
|
+
return id
|
457
|
+
}
|
458
|
+
return 0
|
365
459
|
}
|
366
460
|
|
367
461
|
func getUrlFromRedirectHref(href string) string {
|
@@ -375,11 +469,20 @@ func getImageIdFromHref(href string) int64 {
|
|
375
469
|
return i
|
376
470
|
}
|
377
471
|
|
378
|
-
func
|
472
|
+
func getNumberFromText(text string) int64 {
|
379
473
|
logger.Error("reaction", text)
|
380
474
|
if len(text) > 0 {
|
381
|
-
|
382
|
-
|
475
|
+
match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
|
476
|
+
if len(match) > 0 {
|
477
|
+
count, _ := strconv.ParseInt(match[1], 10, 64)
|
478
|
+
switch match[2] {
|
479
|
+
case "k":
|
480
|
+
count *= 1000
|
481
|
+
case "m":
|
482
|
+
count *= 1000000
|
483
|
+
}
|
484
|
+
return count
|
485
|
+
}
|
383
486
|
}
|
384
487
|
return 0
|
385
488
|
}
|