fbcrawl-colly 0.2.6 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 64e7674aa28cfc9c5c6817afef3c3b4ed7106d1620e6da97c3fc7362ac76dc1e
4
- data.tar.gz: 440cb83ecb7bcb9c461b4c5f9d9e5b3de0f6300384074adee9c08f3be1a68e8b
3
+ metadata.gz: 00d52fcc50bd651adaa80cf6102d7cc3997a07605ebb91fd33006b6e535ed6fc
4
+ data.tar.gz: 24388fa7c98d6dfbe58ba195c330ea3c363f6ef8e586873dd9e485ad29907a6c
5
5
  SHA512:
6
- metadata.gz: 1fa921c01c8b0381cf85ce4b1544d59912e1997f47fab12f8e29636de10305f410fff12a2137f7370a9749cfd2f28cd2a3ef5953de703056bf63a49770af3daf
7
- data.tar.gz: 5696f7fe083b55ed24a5cd128dc71a8c7ebe0a7c7a0dda8811fe2e5796a023635c44d3712584b2c98937bfc1a170e7639b6af250c287bca7cbee36211f10669a
6
+ metadata.gz: 15622279b09030f929218482add87878b58fdcb1e89e1ab8a1124531435707dbe26367fee75204a55fc88f8486c47cbb6ddfdb86035aa24694b1b7d6639aaecc
7
+ data.tar.gz: df50b30ad0234c824505cca54fdf33d0be54c4725c684f0317b1eba81f89d22d7b434bec4e9373b3d43dd830dc17693ac2fb785e75cefc71c6cbeefc44da2616
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.5)
5
- ffi
4
+ fbcrawl-colly (1.0.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.13.0)
11
+ google-protobuf (3.13.0-universal-darwin)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.30.2-universal-darwin)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project crawls mbasic.facebook.com using Go Colly.
4
4
 
5
- Ruby gem will available to use
5
+ The Ruby gem is only a client for the gRPC Colly service.
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,7 +1,73 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
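+ // Crawling RPCs. Init returns a Pointer; the other requests carry one in their pointer field, and FreeColly takes it directly.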
8
+ rpc Init (Empty) returns (Pointer) {}
9
+ rpc FreeColly (Pointer) returns (Empty) {}
10
+ rpc Login (LoginRequest) returns (LoginResponse) {}
11
+ rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
12
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
13
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
14
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
15
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
16
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
17
+ }
18
+
19
+ message Empty {
20
+
21
+ }
22
+
23
+ message Pointer {
24
+ int64 address = 1;
25
+ }
26
+
27
+ message LoginRequest {
28
+ Pointer pointer = 1;
29
+ string email = 2;
30
+ string password = 3;
31
+ string totp_secret = 4;
32
+ }
33
+
34
+ message LoginResponse {
35
+ string cookies = 1;
36
+ }
37
+
38
+ message LoginWithCookiesRequest {
39
+ Pointer pointer = 1;
40
+ string cookies = 2;
41
+ }
42
+
43
+ message FetchGroupInfoRequest {
44
+ Pointer pointer = 1;
45
+ string group_username = 2;
46
+ }
47
+
48
+ message FetchGroupFeedRequest {
49
+ Pointer pointer = 1;
50
+ int64 group_id = 2;
51
+ string next_cursor = 3;
52
+ }
53
+
54
+ message FetchPostRequest {
55
+ Pointer pointer = 1;
56
+ int64 group_id = 2;
57
+ int64 post_id = 3;
58
+ string comment_next_cursor = 4;
59
+ }
60
+
61
+ message FetchContentImagesRequest {
62
+ Pointer pointer = 1;
63
+ int64 post_id = 2;
64
+ string next_cursor = 3;
65
+ }
66
+
67
+ message FetchImageUrlRequest {
68
+ Pointer pointer = 1;
69
+ int64 image_id = 2;
70
+ }
5
71
 
6
72
  // A Facebook group, as returned by FetchGroupInfo and embedded in FacebookPost.
7
73
  message FacebookGroup {
@@ -20,15 +86,20 @@ message FacebookPost {
20
86
  FacebookGroup group = 2;
21
87
  FacebookUser user = 3;
22
88
  string content = 4;
89
+ CommentList comments = 5;
23
90
  string content_link = 6;
24
- FacebookImage content_image = 8;
25
91
  repeated FacebookImage content_images = 7;
26
- repeated FacebookComment comments = 5;
92
+ FacebookImage content_image = 8;
27
93
  int64 created_at = 9;
28
94
  int64 reaction_count = 10;
29
95
  int64 comment_count = 11;
30
96
  }
31
97
 
98
+ message CommentList {
99
+ repeated FacebookComment comments = 5;
100
+ string next_cursor = 12;
101
+ }
102
+
32
103
  message FacebookImage {
33
104
  int64 id = 1;
34
105
  string url = 2;
@@ -44,8 +115,10 @@ message FacebookComment {
44
115
 
45
116
  message FacebookPostList {
46
117
  repeated FacebookPost posts = 1;
118
+ string next_cursor = 2;
47
119
  }
48
120
 
49
121
  message FacebookImageList {
50
122
  repeated FacebookImage images = 1;
123
+ string next_cursor = 2;
51
124
  }
@@ -15,8 +15,9 @@ import (
15
15
  "github.com/olebedev/when/rules/common"
16
16
  "github.com/olebedev/when/rules/en"
17
17
  "github.com/thoas/go-funk"
18
+ "github.com/xlzd/gotp"
18
19
  "net/url"
19
- "qnetwork.net/fbcrawl/fbcrawl"
20
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
20
21
  "regexp"
21
22
  "strconv"
22
23
  "strings"
@@ -108,7 +109,7 @@ func New() *Fbcolly {
108
109
  return &f
109
110
  }
110
111
 
111
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
112
113
  collector := f.collector.Clone()
113
114
  err := setupSharedCollector(collector)
114
115
 
@@ -157,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
157
158
  //logger.Info("Please input OTP")
158
159
  //reader := bufio.NewReader(os.Stdin)
159
160
  //code, _ := reader.ReadString('\n')
160
- code := otp[0:6]
161
- reqMap["approvals_code"] = code
162
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
163
167
  } else {
164
168
  logger.Info("OnHTML Only Continue checkpoint")
165
169
 
@@ -193,22 +197,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
193
197
 
194
198
  }
195
199
 
196
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
197
201
  collector := f.collector.Clone()
198
202
  err := setupSharedCollector(collector)
199
- currentPage := 1
200
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
201
204
 
202
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
203
- currentPage++
204
- if currentPage < 3 {
205
- logger.Info("Will fetch page", currentPage)
206
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
207
- }
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
208
207
  })
209
208
  collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
210
209
  dataElement := element
211
- post := &fbcrawl.FacebookPost{}
210
+ post := &pb.FacebookPost{}
212
211
  var fbDataFt FbDataFt
213
212
  jsonData := dataElement.Attr("data-ft")
214
213
 
@@ -220,8 +219,8 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
220
219
  }
221
220
  logger.Info("Post ", fbDataFt)
222
221
  post.Id = fbDataFt.TopLevelPostId
223
- post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
224
- post.User = &fbcrawl.FacebookUser{
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ post.User = &pb.FacebookUser{
225
224
  Id: fbDataFt.ContentOwnerIdNew,
226
225
  Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
227
226
  }
@@ -241,30 +240,34 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
241
240
  post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
242
241
  post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
243
242
  post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
244
- post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
243
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
245
244
  i, _ := strconv.ParseInt(id, 10, 64)
246
- return &fbcrawl.FacebookImage{
245
+ return &pb.FacebookImage{
247
246
  Id: i,
248
247
  }
249
- })).([]*fbcrawl.FacebookImage)
248
+ })).([]*pb.FacebookImage)
250
249
 
251
250
  if fbDataFt.PhotoId > 0 {
252
- post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
251
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
253
252
  }
254
- result = append(result, post)
253
+ result.Posts = append(result.Posts, post)
255
254
  })
255
+ if len(nextCursor) > 0 {
256
+ err = collector.Visit(nextCursor)
257
+ } else {
258
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
259
+ }
256
260
 
257
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
258
261
  if err != nil {
259
262
  logger.Error("crawl by colly err:", err)
260
263
  }
261
- return err, &fbcrawl.FacebookPostList{Posts: result}
264
+ return err, &result
262
265
  }
263
266
 
264
- func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *fbcrawl.FacebookGroup) {
267
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
265
268
  collector := f.collector.Clone()
266
269
  err := setupSharedCollector(collector)
267
- result := &fbcrawl.FacebookGroup{}
270
+ result := &pb.FacebookGroup{}
268
271
 
269
272
  collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
270
273
  result.Name = element.Text
@@ -281,36 +284,37 @@ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *fbcrawl.Face
281
284
  return err, result
282
285
  }
283
286
 
284
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
287
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
285
288
  collector := f.collector.Clone()
286
289
  err := setupSharedCollector(collector)
287
- currentPage := 1
288
- var result []*fbcrawl.FacebookImage
290
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
289
291
 
290
292
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
291
- currentPage++
292
- logger.Info("Will fetch page", currentPage)
293
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
293
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
294
294
  })
295
295
 
296
296
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
297
- result = append(result, &fbcrawl.FacebookImage{
297
+ result.Images = append(result.Images, &pb.FacebookImage{
298
298
  Id: getImageIdFromHref(element.Attr("href")),
299
299
  })
300
300
  //f.detailCollector.Visit(url)
301
301
  })
302
+ if len(nextCursor) > 0 {
303
+ err = collector.Visit(nextCursor)
304
+ } else {
305
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
306
+ }
302
307
 
303
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
304
308
  if err != nil {
305
309
  logger.Error("crawl by colly err:", err)
306
310
  }
307
- return err, &fbcrawl.FacebookImageList{Images: result}
311
+ return err, &result
308
312
  }
309
313
 
310
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
314
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
311
315
  collector := f.collector.Clone()
312
316
  err := setupSharedCollector(collector)
313
- result := fbcrawl.FacebookImage{Id: imageId}
317
+ result := pb.FacebookImage{Id: imageId}
314
318
 
315
319
  collector.OnHTML("a", func(element *colly.HTMLElement) {
316
320
  result.Url = element.Attr("href")
@@ -323,11 +327,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
323
327
  return err, &result
324
328
  }
325
329
 
326
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
330
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
327
331
  collector := f.collector.Clone()
328
332
  err := setupSharedCollector(collector)
329
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
330
- commentPaging := 0
333
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
331
334
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
332
335
  dataElement := element.DOM.Find("div[data-ft]")
333
336
  if dataElement.Length() > 0 {
@@ -342,8 +345,8 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
342
345
  }
343
346
  logger.Info("Post ", result)
344
347
  post.Id = result.TopLevelPostId
345
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
346
- post.User = &fbcrawl.FacebookUser{
348
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
349
+ post.User = &pb.FacebookUser{
347
350
  Id: result.ContentOwnerIdNew,
348
351
  Name: dataElement.Find("h3 strong:first-child a").Text(),
349
352
  }
@@ -362,15 +365,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
362
365
 
363
366
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
364
367
  post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
365
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
368
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
366
369
  i, _ := strconv.ParseInt(id, 10, 64)
367
- return &fbcrawl.FacebookImage{
370
+ return &pb.FacebookImage{
368
371
  Id: i,
369
372
  }
370
- })).([]*fbcrawl.FacebookImage)
373
+ })).([]*pb.FacebookImage)
371
374
 
372
375
  if result.PhotoId > 0 {
373
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
376
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
374
377
  }
375
378
  }
376
379
 
@@ -380,10 +383,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
380
383
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
381
384
  logger.Info("comment", commentId)
382
385
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
383
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
386
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
384
387
  Id: commentId,
385
- Post: &fbcrawl.FacebookPost{Id: post.Id},
386
- User: &fbcrawl.FacebookUser{
388
+ Post: &pb.FacebookPost{Id: post.Id},
389
+ User: &pb.FacebookUser{
387
390
  Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
388
391
  Name: selection.Find("h3 > a").Text(),
389
392
  },
@@ -396,14 +399,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
396
399
  })
397
400
 
398
401
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
399
- if commentPaging < 3 {
400
- logger.Info("Comment paging", commentPaging)
401
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
402
- commentPaging = commentPaging + 1
403
- }
402
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
404
403
  })
404
+ if len(commentNextCursor) > 0 {
405
+ err = collector.Visit(commentNextCursor)
406
+ } else {
407
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
408
+ }
405
409
 
406
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
407
410
  return err, post
408
411
  }
409
412
 
@@ -417,8 +420,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
417
420
  //}
418
421
 
419
422
  func getUserIdFromCommentHref(href string) int64 {
420
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
421
- return id
423
+ match := regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)
424
+ if len(match) > 0 {
425
+ id, _ := strconv.ParseInt(match[1], 10, 64)
426
+ return id
427
+ }
428
+ return 0
422
429
  }
423
430
 
424
431
  func getUrlFromRedirectHref(href string) string {