fbcrawl-colly 0.2.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 64e7674aa28cfc9c5c6817afef3c3b4ed7106d1620e6da97c3fc7362ac76dc1e
4
- data.tar.gz: 440cb83ecb7bcb9c461b4c5f9d9e5b3de0f6300384074adee9c08f3be1a68e8b
3
+ metadata.gz: 00d52fcc50bd651adaa80cf6102d7cc3997a07605ebb91fd33006b6e535ed6fc
4
+ data.tar.gz: 24388fa7c98d6dfbe58ba195c330ea3c363f6ef8e586873dd9e485ad29907a6c
5
5
  SHA512:
6
- metadata.gz: 1fa921c01c8b0381cf85ce4b1544d59912e1997f47fab12f8e29636de10305f410fff12a2137f7370a9749cfd2f28cd2a3ef5953de703056bf63a49770af3daf
7
- data.tar.gz: 5696f7fe083b55ed24a5cd128dc71a8c7ebe0a7c7a0dda8811fe2e5796a023635c44d3712584b2c98937bfc1a170e7639b6af250c287bca7cbee36211f10669a
6
+ metadata.gz: 15622279b09030f929218482add87878b58fdcb1e89e1ab8a1124531435707dbe26367fee75204a55fc88f8486c47cbb6ddfdb86035aa24694b1b7d6639aaecc
7
+ data.tar.gz: df50b30ad0234c824505cca54fdf33d0be54c4725c684f0317b1eba81f89d22d7b434bec4e9373b3d43dd830dc17693ac2fb785e75cefc71c6cbeefc44da2616
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.5)
5
- ffi
4
+ fbcrawl-colly (1.0.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.13.0)
11
+ google-protobuf (3.13.0-universal-darwin)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.30.2-universal-darwin)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project is to crawl mbasic.facebook.com using GO Colly.
4
4
 
5
- Ruby gem will available to use
5
+ Ruby gem is only client for GRPC colly service
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,7 +1,73 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Sends a greeting
8
+ rpc Init (Empty) returns (Pointer) {}
9
+ rpc FreeColly (Pointer) returns (Empty) {}
10
+ rpc Login (LoginRequest) returns (LoginResponse) {}
11
+ rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
12
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
13
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
14
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
15
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
16
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
17
+ }
18
+
19
+ message Empty {
20
+
21
+ }
22
+
23
+ message Pointer {
24
+ int64 address = 1;
25
+ }
26
+
27
+ message LoginRequest {
28
+ Pointer pointer = 1;
29
+ string email = 2;
30
+ string password = 3;
31
+ string totp_secret = 4;
32
+ }
33
+
34
+ message LoginResponse {
35
+ string cookies = 1;
36
+ }
37
+
38
+ message LoginWithCookiesRequest {
39
+ Pointer pointer = 1;
40
+ string cookies = 2;
41
+ }
42
+
43
+ message FetchGroupInfoRequest {
44
+ Pointer pointer = 1;
45
+ string group_username = 2;
46
+ }
47
+
48
+ message FetchGroupFeedRequest {
49
+ Pointer pointer = 1;
50
+ int64 group_id = 2;
51
+ string next_cursor = 3;
52
+ }
53
+
54
+ message FetchPostRequest {
55
+ Pointer pointer = 1;
56
+ int64 group_id = 2;
57
+ int64 post_id = 3;
58
+ string comment_next_cursor = 4;
59
+ }
60
+
61
+ message FetchContentImagesRequest {
62
+ Pointer pointer = 1;
63
+ int64 post_id = 2;
64
+ string next_cursor = 3;
65
+ }
66
+
67
+ message FetchImageUrlRequest {
68
+ Pointer pointer = 1;
69
+ int64 image_id = 2;
70
+ }
5
71
 
6
72
  // The request message containing the user's name.
7
73
  message FacebookGroup {
@@ -20,15 +86,20 @@ message FacebookPost {
20
86
  FacebookGroup group = 2;
21
87
  FacebookUser user = 3;
22
88
  string content = 4;
89
+ CommentList comments = 5;
23
90
  string content_link = 6;
24
- FacebookImage content_image = 8;
25
91
  repeated FacebookImage content_images = 7;
26
- repeated FacebookComment comments = 5;
92
+ FacebookImage content_image = 8;
27
93
  int64 created_at = 9;
28
94
  int64 reaction_count = 10;
29
95
  int64 comment_count = 11;
30
96
  }
31
97
 
98
+ message CommentList {
99
+ repeated FacebookComment comments = 5;
100
+ string next_cursor = 12;
101
+ }
102
+
32
103
  message FacebookImage {
33
104
  int64 id = 1;
34
105
  string url = 2;
@@ -44,8 +115,10 @@ message FacebookComment {
44
115
 
45
116
  message FacebookPostList {
46
117
  repeated FacebookPost posts = 1;
118
+ string next_cursor = 2;
47
119
  }
48
120
 
49
121
  message FacebookImageList {
50
122
  repeated FacebookImage images = 1;
123
+ string next_cursor = 2;
51
124
  }
@@ -15,8 +15,9 @@ import (
15
15
  "github.com/olebedev/when/rules/common"
16
16
  "github.com/olebedev/when/rules/en"
17
17
  "github.com/thoas/go-funk"
18
+ "github.com/xlzd/gotp"
18
19
  "net/url"
19
- "qnetwork.net/fbcrawl/fbcrawl"
20
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
20
21
  "regexp"
21
22
  "strconv"
22
23
  "strings"
@@ -108,7 +109,7 @@ func New() *Fbcolly {
108
109
  return &f
109
110
  }
110
111
 
111
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
112
113
  collector := f.collector.Clone()
113
114
  err := setupSharedCollector(collector)
114
115
 
@@ -157,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
157
158
  //logger.Info("Please input OTP")
158
159
  //reader := bufio.NewReader(os.Stdin)
159
160
  //code, _ := reader.ReadString('\n')
160
- code := otp[0:6]
161
- reqMap["approvals_code"] = code
162
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
163
167
  } else {
164
168
  logger.Info("OnHTML Only Continue checkpoint")
165
169
 
@@ -193,22 +197,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
193
197
 
194
198
  }
195
199
 
196
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
197
201
  collector := f.collector.Clone()
198
202
  err := setupSharedCollector(collector)
199
- currentPage := 1
200
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
201
204
 
202
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
203
- currentPage++
204
- if currentPage < 3 {
205
- logger.Info("Will fetch page", currentPage)
206
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
207
- }
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
208
207
  })
209
208
  collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
210
209
  dataElement := element
211
- post := &fbcrawl.FacebookPost{}
210
+ post := &pb.FacebookPost{}
212
211
  var fbDataFt FbDataFt
213
212
  jsonData := dataElement.Attr("data-ft")
214
213
 
@@ -220,8 +219,8 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
220
219
  }
221
220
  logger.Info("Post ", fbDataFt)
222
221
  post.Id = fbDataFt.TopLevelPostId
223
- post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
224
- post.User = &fbcrawl.FacebookUser{
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ post.User = &pb.FacebookUser{
225
224
  Id: fbDataFt.ContentOwnerIdNew,
226
225
  Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
227
226
  }
@@ -241,30 +240,34 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
241
240
  post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
242
241
  post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
243
242
  post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
244
- post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
243
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
245
244
  i, _ := strconv.ParseInt(id, 10, 64)
246
- return &fbcrawl.FacebookImage{
245
+ return &pb.FacebookImage{
247
246
  Id: i,
248
247
  }
249
- })).([]*fbcrawl.FacebookImage)
248
+ })).([]*pb.FacebookImage)
250
249
 
251
250
  if fbDataFt.PhotoId > 0 {
252
- post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
251
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
253
252
  }
254
- result = append(result, post)
253
+ result.Posts = append(result.Posts, post)
255
254
  })
255
+ if len(nextCursor) > 0 {
256
+ err = collector.Visit(nextCursor)
257
+ } else {
258
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
259
+ }
256
260
 
257
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
258
261
  if err != nil {
259
262
  logger.Error("crawl by colly err:", err)
260
263
  }
261
- return err, &fbcrawl.FacebookPostList{Posts: result}
264
+ return err, &result
262
265
  }
263
266
 
264
- func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *fbcrawl.FacebookGroup) {
267
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
265
268
  collector := f.collector.Clone()
266
269
  err := setupSharedCollector(collector)
267
- result := &fbcrawl.FacebookGroup{}
270
+ result := &pb.FacebookGroup{}
268
271
 
269
272
  collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
270
273
  result.Name = element.Text
@@ -281,36 +284,37 @@ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *fbcrawl.Face
281
284
  return err, result
282
285
  }
283
286
 
284
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
287
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
285
288
  collector := f.collector.Clone()
286
289
  err := setupSharedCollector(collector)
287
- currentPage := 1
288
- var result []*fbcrawl.FacebookImage
290
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
289
291
 
290
292
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
291
- currentPage++
292
- logger.Info("Will fetch page", currentPage)
293
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
293
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
294
294
  })
295
295
 
296
296
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
297
- result = append(result, &fbcrawl.FacebookImage{
297
+ result.Images = append(result.Images, &pb.FacebookImage{
298
298
  Id: getImageIdFromHref(element.Attr("href")),
299
299
  })
300
300
  //f.detailCollector.Visit(url)
301
301
  })
302
+ if len(nextCursor) > 0 {
303
+ err = collector.Visit(nextCursor)
304
+ } else {
305
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
306
+ }
302
307
 
303
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
304
308
  if err != nil {
305
309
  logger.Error("crawl by colly err:", err)
306
310
  }
307
- return err, &fbcrawl.FacebookImageList{Images: result}
311
+ return err, &result
308
312
  }
309
313
 
310
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
314
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
311
315
  collector := f.collector.Clone()
312
316
  err := setupSharedCollector(collector)
313
- result := fbcrawl.FacebookImage{Id: imageId}
317
+ result := pb.FacebookImage{Id: imageId}
314
318
 
315
319
  collector.OnHTML("a", func(element *colly.HTMLElement) {
316
320
  result.Url = element.Attr("href")
@@ -323,11 +327,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
323
327
  return err, &result
324
328
  }
325
329
 
326
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
330
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
327
331
  collector := f.collector.Clone()
328
332
  err := setupSharedCollector(collector)
329
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
330
- commentPaging := 0
333
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
331
334
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
332
335
  dataElement := element.DOM.Find("div[data-ft]")
333
336
  if dataElement.Length() > 0 {
@@ -342,8 +345,8 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
342
345
  }
343
346
  logger.Info("Post ", result)
344
347
  post.Id = result.TopLevelPostId
345
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
346
- post.User = &fbcrawl.FacebookUser{
348
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
349
+ post.User = &pb.FacebookUser{
347
350
  Id: result.ContentOwnerIdNew,
348
351
  Name: dataElement.Find("h3 strong:first-child a").Text(),
349
352
  }
@@ -362,15 +365,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
362
365
 
363
366
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
364
367
  post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
365
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
368
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
366
369
  i, _ := strconv.ParseInt(id, 10, 64)
367
- return &fbcrawl.FacebookImage{
370
+ return &pb.FacebookImage{
368
371
  Id: i,
369
372
  }
370
- })).([]*fbcrawl.FacebookImage)
373
+ })).([]*pb.FacebookImage)
371
374
 
372
375
  if result.PhotoId > 0 {
373
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
376
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
374
377
  }
375
378
  }
376
379
 
@@ -380,10 +383,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
380
383
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
381
384
  logger.Info("comment", commentId)
382
385
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
383
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
386
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
384
387
  Id: commentId,
385
- Post: &fbcrawl.FacebookPost{Id: post.Id},
386
- User: &fbcrawl.FacebookUser{
388
+ Post: &pb.FacebookPost{Id: post.Id},
389
+ User: &pb.FacebookUser{
387
390
  Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
388
391
  Name: selection.Find("h3 > a").Text(),
389
392
  },
@@ -396,14 +399,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
396
399
  })
397
400
 
398
401
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
399
- if commentPaging < 3 {
400
- logger.Info("Comment paging", commentPaging)
401
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
402
- commentPaging = commentPaging + 1
403
- }
402
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
404
403
  })
404
+ if len(commentNextCursor) > 0 {
405
+ err = collector.Visit(commentNextCursor)
406
+ } else {
407
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
408
+ }
405
409
 
406
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
407
410
  return err, post
408
411
  }
409
412
 
@@ -417,8 +420,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
417
420
  //}
418
421
 
419
422
  func getUserIdFromCommentHref(href string) int64 {
420
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
421
- return id
423
+ match := regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)
424
+ if len(match) > 0 {
425
+ id, _ := strconv.ParseInt(match[1], 10, 64)
426
+ return id
427
+ }
428
+ return 0
422
429
  }
423
430
 
424
431
  func getUrlFromRedirectHref(href string) string {