fbcrawl-colly 0.2.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -2
- data/Dockerfile +14 -0
- data/Gemfile.lock +8 -7
- data/README.md +1 -1
- data/fbcrawl-colly.gemspec +2 -6
- data/fbcrawl.proto +78 -3
- data/{fbcolly → fbcrawl}/fbcolly.go +153 -80
- data/fbcrawl/pb/fbcrawl.pb.go +1647 -0
- data/fbcrawl/pb/fbcrawl_grpc.pb.go +380 -0
- data/go.mod +5 -11
- data/go.sum +21 -10
- data/lib/fbcrawl-colly.rb +1 -4
- data/lib/fbcrawl_colly.rb +5 -0
- data/lib/fbcrawl_colly/client.rb +46 -0
- data/lib/fbcrawl_colly/version.rb +1 -1
- data/lib/pb/fbcrawl_pb.rb +117 -0
- data/lib/pb/fbcrawl_services_pb.rb +31 -0
- data/main.go +76 -71
- metadata +14 -26
- data/ext/fbcrawl_colly/.gitignore +0 -2
- data/ext/fbcrawl_colly/Makefile +0 -6
- data/ext/fbcrawl_colly/extconf.rb +0 -6
- data/lib/fbcrawl_colly/colly.rb +0 -50
- data/lib/fbcrawl_colly/ffi.rb +0 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 00d52fcc50bd651adaa80cf6102d7cc3997a07605ebb91fd33006b6e535ed6fc
|
4
|
+
data.tar.gz: 24388fa7c98d6dfbe58ba195c330ea3c363f6ef8e586873dd9e485ad29907a6c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 15622279b09030f929218482add87878b58fdcb1e89e1ab8a1124531435707dbe26367fee75204a55fc88f8486c47cbb6ddfdb86035aa24694b1b7d6639aaecc
|
7
|
+
data.tar.gz: df50b30ad0234c824505cca54fdf33d0be54c4725c684f0317b1eba81f89d22d7b434bec4e9373b3d43dd830dc17693ac2fb785e75cefc71c6cbeefc44da2616
|
data/.gitignore
CHANGED
data/Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
FROM golang:1.14-alpine
|
2
|
+
RUN apk add --no-cache git build-base tzdata
|
3
|
+
|
4
|
+
RUN mkdir -p /app
|
5
|
+
WORKDIR /app
|
6
|
+
ADD ./go.mod /app
|
7
|
+
ADD ./go.sum /app
|
8
|
+
ADD ./ /app
|
9
|
+
RUN go get
|
10
|
+
|
11
|
+
|
12
|
+
ENV PORT 3000
|
13
|
+
RUN go build -o server qnetwork.net/fbcrawl
|
14
|
+
ENTRYPOINT ["./server"]
|
data/Gemfile.lock
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
fbcrawl-colly (0.2.1)
|
5
|
-
ffi
|
4
|
+
fbcrawl-colly (1.0.0)
|
6
5
|
google-protobuf
|
6
|
+
grpc
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
|
12
|
-
|
11
|
+
google-protobuf (3.13.0-universal-darwin)
|
12
|
+
googleapis-common-protos-types (1.0.5)
|
13
|
+
google-protobuf (~> 3.11)
|
14
|
+
grpc (1.30.2-universal-darwin)
|
15
|
+
google-protobuf (~> 3.12)
|
16
|
+
googleapis-common-protos-types (~> 1.0)
|
13
17
|
minitest (5.14.1)
|
14
18
|
rake (12.3.3)
|
15
|
-
rake-compiler (1.1.1)
|
16
|
-
rake
|
17
19
|
|
18
20
|
PLATFORMS
|
19
21
|
ruby
|
@@ -22,7 +24,6 @@ DEPENDENCIES
|
|
22
24
|
fbcrawl-colly!
|
23
25
|
minitest (~> 5.0)
|
24
26
|
rake (~> 12.0)
|
25
|
-
rake-compiler
|
26
27
|
|
27
28
|
BUNDLED WITH
|
28
29
|
2.1.4
|
data/README.md
CHANGED
data/fbcrawl-colly.gemspec
CHANGED
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
end
|
26
26
|
# spec.bindir = "exe"
|
27
27
|
# spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
|
-
spec.
|
29
|
-
'ext/fbcrawl_colly/extconf.rb'
|
30
|
-
]
|
31
|
-
spec.require_paths = ["lib"]
|
28
|
+
spec.require_paths = %w[lib lib/pb]
|
32
29
|
|
33
|
-
spec.add_runtime_dependency 'ffi'
|
34
30
|
spec.add_runtime_dependency 'google-protobuf'
|
35
|
-
spec.
|
31
|
+
spec.add_runtime_dependency 'grpc'
|
36
32
|
end
|
data/fbcrawl.proto
CHANGED
@@ -1,12 +1,79 @@
|
|
1
1
|
syntax = "proto3";
|
2
2
|
|
3
3
|
package fbcrawl_colly;
|
4
|
-
option go_package = "./fbcrawl;
|
4
|
+
option go_package = "./fbcrawl/pb;pb";
|
5
|
+
|
6
|
+
service Grpc {
|
7
|
+
// Sends a greeting
|
8
|
+
rpc Init (Empty) returns (Pointer) {}
|
9
|
+
rpc FreeColly (Pointer) returns (Empty) {}
|
10
|
+
rpc Login (LoginRequest) returns (LoginResponse) {}
|
11
|
+
rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
|
12
|
+
rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
|
13
|
+
rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
|
14
|
+
rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
|
15
|
+
rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
|
16
|
+
rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
|
17
|
+
}
|
18
|
+
|
19
|
+
message Empty {
|
20
|
+
|
21
|
+
}
|
22
|
+
|
23
|
+
message Pointer {
|
24
|
+
int64 address = 1;
|
25
|
+
}
|
26
|
+
|
27
|
+
message LoginRequest {
|
28
|
+
Pointer pointer = 1;
|
29
|
+
string email = 2;
|
30
|
+
string password = 3;
|
31
|
+
string totp_secret = 4;
|
32
|
+
}
|
33
|
+
|
34
|
+
message LoginResponse {
|
35
|
+
string cookies = 1;
|
36
|
+
}
|
37
|
+
|
38
|
+
message LoginWithCookiesRequest {
|
39
|
+
Pointer pointer = 1;
|
40
|
+
string cookies = 2;
|
41
|
+
}
|
42
|
+
|
43
|
+
message FetchGroupInfoRequest {
|
44
|
+
Pointer pointer = 1;
|
45
|
+
string group_username = 2;
|
46
|
+
}
|
47
|
+
|
48
|
+
message FetchGroupFeedRequest {
|
49
|
+
Pointer pointer = 1;
|
50
|
+
int64 group_id = 2;
|
51
|
+
string next_cursor = 3;
|
52
|
+
}
|
53
|
+
|
54
|
+
message FetchPostRequest {
|
55
|
+
Pointer pointer = 1;
|
56
|
+
int64 group_id = 2;
|
57
|
+
int64 post_id = 3;
|
58
|
+
string comment_next_cursor = 4;
|
59
|
+
}
|
60
|
+
|
61
|
+
message FetchContentImagesRequest {
|
62
|
+
Pointer pointer = 1;
|
63
|
+
int64 post_id = 2;
|
64
|
+
string next_cursor = 3;
|
65
|
+
}
|
66
|
+
|
67
|
+
message FetchImageUrlRequest {
|
68
|
+
Pointer pointer = 1;
|
69
|
+
int64 image_id = 2;
|
70
|
+
}
|
5
71
|
|
6
72
|
// The request message containing the user's name.
|
7
73
|
message FacebookGroup {
|
8
74
|
int64 id = 1;
|
9
75
|
string name = 2;
|
76
|
+
int64 member_count = 3;
|
10
77
|
}
|
11
78
|
|
12
79
|
message FacebookUser {
|
@@ -19,12 +86,18 @@ message FacebookPost {
|
|
19
86
|
FacebookGroup group = 2;
|
20
87
|
FacebookUser user = 3;
|
21
88
|
string content = 4;
|
89
|
+
CommentList comments = 5;
|
22
90
|
string content_link = 6;
|
23
|
-
FacebookImage content_image = 8;
|
24
91
|
repeated FacebookImage content_images = 7;
|
25
|
-
|
92
|
+
FacebookImage content_image = 8;
|
26
93
|
int64 created_at = 9;
|
27
94
|
int64 reaction_count = 10;
|
95
|
+
int64 comment_count = 11;
|
96
|
+
}
|
97
|
+
|
98
|
+
message CommentList {
|
99
|
+
repeated FacebookComment comments = 5;
|
100
|
+
string next_cursor = 12;
|
28
101
|
}
|
29
102
|
|
30
103
|
message FacebookImage {
|
@@ -42,8 +115,10 @@ message FacebookComment {
|
|
42
115
|
|
43
116
|
message FacebookPostList {
|
44
117
|
repeated FacebookPost posts = 1;
|
118
|
+
string next_cursor = 2;
|
45
119
|
}
|
46
120
|
|
47
121
|
message FacebookImageList {
|
48
122
|
repeated FacebookImage images = 1;
|
123
|
+
string next_cursor = 2;
|
49
124
|
}
|
@@ -6,16 +6,18 @@ import (
|
|
6
6
|
"errors"
|
7
7
|
"fmt"
|
8
8
|
"github.com/PuerkitoBio/goquery"
|
9
|
-
"github.com/gocolly/colly"
|
10
|
-
"github.com/gocolly/colly/
|
11
|
-
"github.com/gocolly/colly/
|
9
|
+
"github.com/gocolly/colly/v2"
|
10
|
+
"github.com/gocolly/colly/v2/debug"
|
11
|
+
"github.com/gocolly/colly/v2/extensions"
|
12
|
+
"github.com/gocolly/colly/v2/storage"
|
12
13
|
"github.com/google/logger"
|
13
14
|
"github.com/olebedev/when"
|
14
15
|
"github.com/olebedev/when/rules/common"
|
15
16
|
"github.com/olebedev/when/rules/en"
|
16
17
|
"github.com/thoas/go-funk"
|
18
|
+
"github.com/xlzd/gotp"
|
17
19
|
"net/url"
|
18
|
-
"qnetwork.net/fbcrawl/fbcrawl"
|
20
|
+
"qnetwork.net/fbcrawl/fbcrawl/pb"
|
19
21
|
"regexp"
|
20
22
|
"strconv"
|
21
23
|
"strings"
|
@@ -42,7 +44,7 @@ type FbDataFt struct {
|
|
42
44
|
}
|
43
45
|
|
44
46
|
func sharedOnRequest(request *colly.Request) {
|
45
|
-
logger.Info("OnRequest")
|
47
|
+
logger.Info("OnRequest ", request.URL)
|
46
48
|
//request.Headers.Set("Host", "facebook.com")
|
47
49
|
request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
|
48
50
|
request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
@@ -62,9 +64,10 @@ func sharedOnRequest(request *colly.Request) {
|
|
62
64
|
func setupSharedCollector(collector *colly.Collector) error {
|
63
65
|
var err error
|
64
66
|
extensions.Referer(collector)
|
65
|
-
|
67
|
+
collector.AllowURLRevisit = true
|
66
68
|
collector.OnRequest(sharedOnRequest)
|
67
69
|
collector.OnResponse(sharedOnResponse)
|
70
|
+
collector.SetDebugger(&debug.LogDebugger{})
|
68
71
|
collector.OnError(func(resp *colly.Response, errHttp error) {
|
69
72
|
err = errHttp
|
70
73
|
logger.Error("OnError", err)
|
@@ -106,26 +109,29 @@ func New() *Fbcolly {
|
|
106
109
|
return &f
|
107
110
|
}
|
108
111
|
|
109
|
-
func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
|
112
|
+
func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
|
110
113
|
collector := f.collector.Clone()
|
111
114
|
err := setupSharedCollector(collector)
|
112
115
|
|
113
116
|
logger.Info("Login using email", email)
|
114
117
|
loggedIn := false
|
115
|
-
|
118
|
+
firstLogin := true
|
116
119
|
collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
120
|
+
if firstLogin {
|
121
|
+
firstLogin = false
|
122
|
+
logger.Info("OnHTML login_form")
|
123
|
+
loginURL, err, reqMap := getForm(element, err)
|
124
|
+
if err != nil {
|
125
|
+
logger.Error(err)
|
126
|
+
return
|
127
|
+
}
|
128
|
+
reqMap["email"] = email
|
129
|
+
reqMap["pass"] = password
|
130
|
+
logger.Info("req map:", reqMap)
|
131
|
+
err = collector.Post(loginURL, reqMap)
|
132
|
+
if err != nil {
|
133
|
+
logger.Error("post err:", err)
|
134
|
+
}
|
129
135
|
}
|
130
136
|
})
|
131
137
|
|
@@ -152,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
152
158
|
//logger.Info("Please input OTP")
|
153
159
|
//reader := bufio.NewReader(os.Stdin)
|
154
160
|
//code, _ := reader.ReadString('\n')
|
155
|
-
|
156
|
-
|
157
|
-
|
161
|
+
if len(totpSecret) > 0 {
|
162
|
+
code := gotp.NewDefaultTOTP(totpSecret).Now()
|
163
|
+
reqMap["approvals_code"] = code
|
164
|
+
shouldSubmit = true
|
165
|
+
}
|
166
|
+
|
158
167
|
} else {
|
159
168
|
logger.Info("OnHTML Only Continue checkpoint")
|
160
169
|
|
@@ -188,68 +197,124 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
|
|
188
197
|
|
189
198
|
}
|
190
199
|
|
191
|
-
func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *
|
200
|
+
func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
|
192
201
|
collector := f.collector.Clone()
|
193
202
|
err := setupSharedCollector(collector)
|
194
|
-
|
195
|
-
var result []*fbcrawl.FacebookPost
|
203
|
+
result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
|
196
204
|
|
197
205
|
collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
206
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
207
|
+
})
|
208
|
+
collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
|
209
|
+
dataElement := element
|
210
|
+
post := &pb.FacebookPost{}
|
211
|
+
var fbDataFt FbDataFt
|
212
|
+
jsonData := dataElement.Attr("data-ft")
|
213
|
+
|
214
|
+
logger.Info(jsonData)
|
215
|
+
err = json.Unmarshal([]byte(jsonData), &fbDataFt)
|
216
|
+
if err != nil {
|
217
|
+
logger.Error(err)
|
218
|
+
return
|
219
|
+
}
|
220
|
+
logger.Info("Post ", fbDataFt)
|
221
|
+
post.Id = fbDataFt.TopLevelPostId
|
222
|
+
post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
|
223
|
+
post.User = &pb.FacebookUser{
|
224
|
+
Id: fbDataFt.ContentOwnerIdNew,
|
225
|
+
Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
|
226
|
+
}
|
227
|
+
post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
|
228
|
+
//Content
|
229
|
+
|
230
|
+
//NO BACKGROUND TEXT ONLY
|
231
|
+
post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
232
|
+
return selection.Text()
|
233
|
+
}), "\n")
|
234
|
+
|
235
|
+
if len(post.Content) == 0 {
|
236
|
+
// TEXT WITH BACKGROUND
|
237
|
+
post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
|
202
238
|
}
|
239
|
+
|
240
|
+
post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
241
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
|
242
|
+
post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
|
243
|
+
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
244
|
+
i, _ := strconv.ParseInt(id, 10, 64)
|
245
|
+
return &pb.FacebookImage{
|
246
|
+
Id: i,
|
247
|
+
}
|
248
|
+
})).([]*pb.FacebookImage)
|
249
|
+
|
250
|
+
if fbDataFt.PhotoId > 0 {
|
251
|
+
post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
|
252
|
+
}
|
253
|
+
result.Posts = append(result.Posts, post)
|
203
254
|
})
|
255
|
+
if len(nextCursor) > 0 {
|
256
|
+
err = collector.Visit(nextCursor)
|
257
|
+
} else {
|
258
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
259
|
+
}
|
204
260
|
|
205
|
-
|
206
|
-
logger.
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
261
|
+
if err != nil {
|
262
|
+
logger.Error("crawl by colly err:", err)
|
263
|
+
}
|
264
|
+
return err, &result
|
265
|
+
}
|
266
|
+
|
267
|
+
func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
|
268
|
+
collector := f.collector.Clone()
|
269
|
+
err := setupSharedCollector(collector)
|
270
|
+
result := &pb.FacebookGroup{}
|
271
|
+
|
272
|
+
collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
|
273
|
+
result.Name = element.Text
|
274
|
+
})
|
275
|
+
collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
|
276
|
+
result.Id = getNumberFromText(element.Attr("href"))
|
277
|
+
result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
|
214
278
|
})
|
215
279
|
|
216
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%
|
280
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
|
217
281
|
if err != nil {
|
218
282
|
logger.Error("crawl by colly err:", err)
|
219
283
|
}
|
220
|
-
return err,
|
284
|
+
return err, result
|
221
285
|
}
|
222
286
|
|
223
|
-
func (f *Fbcolly) FetchContentImages(postId int64) (error, *
|
287
|
+
func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
|
224
288
|
collector := f.collector.Clone()
|
225
289
|
err := setupSharedCollector(collector)
|
226
|
-
|
227
|
-
var result []*fbcrawl.FacebookImage
|
290
|
+
result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
|
228
291
|
|
229
292
|
collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
|
230
|
-
|
231
|
-
logger.Info("Will fetch page", currentPage)
|
232
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
293
|
+
result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
233
294
|
})
|
234
295
|
|
235
296
|
collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
|
236
|
-
result = append(result, &
|
297
|
+
result.Images = append(result.Images, &pb.FacebookImage{
|
237
298
|
Id: getImageIdFromHref(element.Attr("href")),
|
238
299
|
})
|
239
300
|
//f.detailCollector.Visit(url)
|
240
301
|
})
|
302
|
+
if len(nextCursor) > 0 {
|
303
|
+
err = collector.Visit(nextCursor)
|
304
|
+
} else {
|
305
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
306
|
+
}
|
241
307
|
|
242
|
-
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
243
308
|
if err != nil {
|
244
309
|
logger.Error("crawl by colly err:", err)
|
245
310
|
}
|
246
|
-
return err, &
|
311
|
+
return err, &result
|
247
312
|
}
|
248
313
|
|
249
|
-
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
314
|
+
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
|
250
315
|
collector := f.collector.Clone()
|
251
316
|
err := setupSharedCollector(collector)
|
252
|
-
result :=
|
317
|
+
result := pb.FacebookImage{Id: imageId}
|
253
318
|
|
254
319
|
collector.OnHTML("a", func(element *colly.HTMLElement) {
|
255
320
|
result.Url = element.Attr("href")
|
@@ -262,11 +327,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
|
262
327
|
return err, &result
|
263
328
|
}
|
264
329
|
|
265
|
-
func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
|
330
|
+
func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
|
266
331
|
collector := f.collector.Clone()
|
267
332
|
err := setupSharedCollector(collector)
|
268
|
-
post := &
|
269
|
-
commentPaging := 0
|
333
|
+
post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
|
270
334
|
collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
|
271
335
|
dataElement := element.DOM.Find("div[data-ft]")
|
272
336
|
if dataElement.Length() > 0 {
|
@@ -281,8 +345,8 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
281
345
|
}
|
282
346
|
logger.Info("Post ", result)
|
283
347
|
post.Id = result.TopLevelPostId
|
284
|
-
post.Group = &
|
285
|
-
post.User = &
|
348
|
+
post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
|
349
|
+
post.User = &pb.FacebookUser{
|
286
350
|
Id: result.ContentOwnerIdNew,
|
287
351
|
Name: dataElement.Find("h3 strong:first-child a").Text(),
|
288
352
|
}
|
@@ -300,21 +364,17 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
300
364
|
}
|
301
365
|
|
302
366
|
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
303
|
-
post.ReactionCount =
|
304
|
-
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *
|
367
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
|
368
|
+
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
|
305
369
|
i, _ := strconv.ParseInt(id, 10, 64)
|
306
|
-
return &
|
370
|
+
return &pb.FacebookImage{
|
307
371
|
Id: i,
|
308
372
|
}
|
309
|
-
})).([]*
|
373
|
+
})).([]*pb.FacebookImage)
|
310
374
|
|
311
375
|
if result.PhotoId > 0 {
|
312
|
-
post.ContentImage = &
|
376
|
+
post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
|
313
377
|
}
|
314
|
-
|
315
|
-
logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
316
|
-
return selection.Text()
|
317
|
-
}), "\n"))
|
318
378
|
}
|
319
379
|
|
320
380
|
//Comment
|
@@ -323,10 +383,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
323
383
|
commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
|
324
384
|
logger.Info("comment", commentId)
|
325
385
|
createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
|
326
|
-
post.Comments = append(post.Comments, &
|
386
|
+
post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
|
327
387
|
Id: commentId,
|
328
|
-
Post: &
|
329
|
-
User: &
|
388
|
+
Post: &pb.FacebookPost{Id: post.Id},
|
389
|
+
User: &pb.FacebookUser{
|
330
390
|
Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
|
331
391
|
Name: selection.Find("h3 > a").Text(),
|
332
392
|
},
|
@@ -339,14 +399,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
339
399
|
})
|
340
400
|
|
341
401
|
collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
|
342
|
-
|
343
|
-
logger.Info("Comment paging", commentPaging)
|
344
|
-
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
345
|
-
commentPaging = commentPaging + 1
|
346
|
-
}
|
402
|
+
post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
|
347
403
|
})
|
404
|
+
if len(commentNextCursor) > 0 {
|
405
|
+
err = collector.Visit(commentNextCursor)
|
406
|
+
} else {
|
407
|
+
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
408
|
+
}
|
348
409
|
|
349
|
-
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
350
410
|
return err, post
|
351
411
|
}
|
352
412
|
|
@@ -360,8 +420,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
|
|
360
420
|
//}
|
361
421
|
|
362
422
|
func getUserIdFromCommentHref(href string) int64 {
|
363
|
-
|
364
|
-
|
423
|
+
match := regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)
|
424
|
+
if len(match) > 0 {
|
425
|
+
id, _ := strconv.ParseInt(match[1], 10, 64)
|
426
|
+
return id
|
427
|
+
}
|
428
|
+
return 0
|
365
429
|
}
|
366
430
|
|
367
431
|
func getUrlFromRedirectHref(href string) string {
|
@@ -375,11 +439,20 @@ func getImageIdFromHref(href string) int64 {
|
|
375
439
|
return i
|
376
440
|
}
|
377
441
|
|
378
|
-
func
|
442
|
+
func getNumberFromText(text string) int64 {
|
379
443
|
logger.Error("reaction", text)
|
380
444
|
if len(text) > 0 {
|
381
|
-
|
382
|
-
|
445
|
+
match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
|
446
|
+
if len(match) > 0 {
|
447
|
+
count, _ := strconv.ParseInt(match[1], 10, 64)
|
448
|
+
switch match[2] {
|
449
|
+
case "k":
|
450
|
+
count *= 1000
|
451
|
+
case "m":
|
452
|
+
count *= 1000000
|
453
|
+
}
|
454
|
+
return count
|
455
|
+
}
|
383
456
|
}
|
384
457
|
return 0
|
385
458
|
}
|