fbcrawl-colly 0.2.3 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6e43a203c1c38131f0d9eb32b67a1859bd1530aac9249751ff0329a5901388f6
4
- data.tar.gz: f3520762f00d2b1a2d475a9b97d2141fe8cd98faf718ccf3d8a797e61489f976
3
+ metadata.gz: 138016b11ed36541b96961dbfad0627b5d8bccd26a7638f95e5a8806d0fadf31
4
+ data.tar.gz: 376c58bc102d6ece54631f0151889d7546af436e9f40f64d994ac44b7110e29e
5
5
  SHA512:
6
- metadata.gz: 2560aa6a3239671a14f82a95034bbad03684dd44b0fc32cb2092b404678b2ec0e124a33db64a92f5c22f4cd3d9d94a42ccc40a6ab2e1e362088d0f02f05d22f3
7
- data.tar.gz: 305f3d463d75bc3edcfcd1b5a985d9a9882e990e850786132eb1b6d4e93f4e3e31a2c0e9aeb23141cc0633cd1d5ad8e1d88ec4c0e3e4d2d201a6298a7c8699c6
6
+ metadata.gz: d8580cf0cf980989aab899d4f1185a33dff13743cfedea5487e59f8a990cd23d8c9c4751e18c2192b396c27c4ef5d5e95919404522fc4399b9754c1c0753f63a
7
+ data.tar.gz: 84b182c8dcfd4a3b5aa7297c6cd3bca485a4f372d65579f03220c32bcf2fcda1d95be02cff4caf5c0bb88a997daa53684b6f25fc8c088be4e7e31406c4f1f4ba
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.3)
5
- ffi
4
+ fbcrawl-colly (1.1.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.12.4)
11
+ google-protobuf (3.13.0)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.31.1)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project is to crawl mbasic.facebook.com using GO Colly.
4
4
 
5
- Ruby gem will available to use
5
+ Ruby gem is only client for GRPC colly service
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,17 +1,87 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Sends a greeting
8
+ rpc Login (LoginRequest) returns (LoginResponse) {}
9
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
10
+ rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
11
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
12
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
13
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
14
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
15
+ }
16
+
17
+ message Empty {
18
+
19
+ }
20
+
21
+ message Context {
22
+ string cookies = 1;
23
+ }
24
+
25
+ message LoginRequest {
26
+ string email = 2;
27
+ string password = 3;
28
+ string totp_secret = 4;
29
+ }
30
+
31
+ message LoginResponse {
32
+ string cookies = 1;
33
+ }
34
+
35
+ message LoginWithCookiesRequest {
36
+ string cookies = 1;
37
+ }
38
+
39
+ message FetchGroupInfoRequest {
40
+ Context context = 1;
41
+ string group_username = 2;
42
+ }
43
+
44
+ message FetchUserInfoRequest {
45
+ Context context = 1;
46
+ string username = 2;
47
+ }
48
+
49
+ message FetchGroupFeedRequest {
50
+ Context context = 1;
51
+ int64 group_id = 2;
52
+ string next_cursor = 3;
53
+ }
54
+
55
+ message FetchPostRequest {
56
+ Context context = 1;
57
+ int64 group_id = 2;
58
+ int64 post_id = 3;
59
+ string comment_next_cursor = 4;
60
+ }
61
+
62
+ message FetchContentImagesRequest {
63
+ Context context = 1;
64
+ int64 post_id = 2;
65
+ string next_cursor = 3;
66
+ }
67
+
68
+ message FetchImageUrlRequest {
69
+ Context context = 1;
70
+ int64 image_id = 2;
71
+ }
5
72
 
6
73
  // The request message containing the user's name.
7
74
  message FacebookGroup {
8
75
  int64 id = 1;
9
76
  string name = 2;
77
+ int64 member_count = 3;
10
78
  }
11
79
 
12
80
  message FacebookUser {
13
81
  int64 id = 1;
14
82
  string name = 2;
83
+ string username = 3;
84
+ int64 friend_count = 4;
15
85
  }
16
86
 
17
87
  message FacebookPost {
@@ -19,15 +89,20 @@ message FacebookPost {
19
89
  FacebookGroup group = 2;
20
90
  FacebookUser user = 3;
21
91
  string content = 4;
92
+ CommentList comments = 5;
22
93
  string content_link = 6;
23
- FacebookImage content_image = 8;
24
94
  repeated FacebookImage content_images = 7;
25
- repeated FacebookComment comments = 5;
95
+ FacebookImage content_image = 8;
26
96
  int64 created_at = 9;
27
97
  int64 reaction_count = 10;
28
98
  int64 comment_count = 11;
29
99
  }
30
100
 
101
+ message CommentList {
102
+ repeated FacebookComment comments = 5;
103
+ string next_cursor = 12;
104
+ }
105
+
31
106
  message FacebookImage {
32
107
  int64 id = 1;
33
108
  string url = 2;
@@ -43,8 +118,10 @@ message FacebookComment {
43
118
 
44
119
  message FacebookPostList {
45
120
  repeated FacebookPost posts = 1;
121
+ string next_cursor = 2;
46
122
  }
47
123
 
48
124
  message FacebookImageList {
49
125
  repeated FacebookImage images = 1;
126
+ string next_cursor = 2;
50
127
  }
@@ -6,16 +6,17 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/extensions"
11
+ "github.com/gocolly/colly/v2/storage"
12
12
  "github.com/google/logger"
13
13
  "github.com/olebedev/when"
14
14
  "github.com/olebedev/when/rules/common"
15
15
  "github.com/olebedev/when/rules/en"
16
16
  "github.com/thoas/go-funk"
17
+ "github.com/xlzd/gotp"
17
18
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
19
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
20
  "regexp"
20
21
  "strconv"
21
22
  "strings"
@@ -33,7 +34,7 @@ type FbDataInsight struct {
33
34
  FbDataPostContext `json:"post_context"`
34
35
  }
35
36
  type FbDataFt struct {
36
- ContentOwnerIdNew int64 `json:"content_owner_id_new"`
37
+ ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
37
38
  PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
39
  PhotoId int64 `json:"photo_id,string"`
39
40
  PageId int64 `json:"page_id,string"`
@@ -42,7 +43,7 @@ type FbDataFt struct {
42
43
  }
43
44
 
44
45
  func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
46
+ logger.Info("OnRequest ", request.URL)
46
47
  //request.Headers.Set("Host", "facebook.com")
47
48
  request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
49
  request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
@@ -106,26 +107,29 @@ func New() *Fbcolly {
106
107
  return &f
107
108
  }
108
109
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
110
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
111
  collector := f.collector.Clone()
111
112
  err := setupSharedCollector(collector)
112
113
 
113
114
  logger.Info("Login using email", email)
114
115
  loggedIn := false
115
-
116
+ firstLogin := true
116
117
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
117
- logger.Info("OnHTML login_form")
118
- loginURL, err, reqMap := getForm(element, err)
119
- if err != nil {
120
- logger.Error(err)
121
- return
122
- }
123
- reqMap["email"] = email
124
- reqMap["pass"] = password
125
- logger.Info("req map:", reqMap)
126
- err = collector.Post(loginURL, reqMap)
127
- if err != nil {
128
- logger.Error("post err:", err)
118
+ if firstLogin {
119
+ firstLogin = false
120
+ logger.Info("OnHTML login_form")
121
+ loginURL, err, reqMap := getForm(element, err)
122
+ if err != nil {
123
+ logger.Error(err)
124
+ return
125
+ }
126
+ reqMap["email"] = email
127
+ reqMap["pass"] = password
128
+ logger.Info("req map:", reqMap)
129
+ err = collector.Post(loginURL, reqMap)
130
+ if err != nil {
131
+ logger.Error("post err:", err)
132
+ }
129
133
  }
130
134
  })
131
135
 
@@ -152,9 +156,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
152
156
  //logger.Info("Please input OTP")
153
157
  //reader := bufio.NewReader(os.Stdin)
154
158
  //code, _ := reader.ReadString('\n')
155
- code := otp[0:6]
156
- reqMap["approvals_code"] = code
157
- shouldSubmit = true
159
+ if len(totpSecret) > 0 {
160
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
161
+ reqMap["approvals_code"] = code
162
+ shouldSubmit = true
163
+ }
164
+
158
165
  } else {
159
166
  logger.Info("OnHTML Only Continue checkpoint")
160
167
 
@@ -188,22 +195,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
188
195
 
189
196
  }
190
197
 
191
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
198
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
192
199
  collector := f.collector.Clone()
193
200
  err := setupSharedCollector(collector)
194
- currentPage := 1
195
- var result []*fbcrawl.FacebookPost
201
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
196
202
 
197
203
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
198
- currentPage++
199
- if currentPage < 3 {
200
- logger.Info("Will fetch page", currentPage)
201
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
202
- }
204
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
203
205
  })
204
- collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
206
+ collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
205
207
  dataElement := element
206
- post := &fbcrawl.FacebookPost{}
208
+ post := &pb.FacebookPost{}
207
209
  var fbDataFt FbDataFt
208
210
  jsonData := dataElement.Attr("data-ft")
209
211
 
@@ -215,9 +217,10 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
215
217
  }
216
218
  logger.Info("Post ", fbDataFt)
217
219
  post.Id = fbDataFt.TopLevelPostId
218
- post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
219
- post.User = &fbcrawl.FacebookUser{
220
- Id: fbDataFt.ContentOwnerIdNew,
220
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
221
+ userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
222
+ post.User = &pb.FacebookUser{
223
+ Id: userId,
221
224
  Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
222
225
  }
223
226
  post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
@@ -236,56 +239,108 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
236
239
  post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
237
240
  post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
238
241
  post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
239
- post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
242
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
240
243
  i, _ := strconv.ParseInt(id, 10, 64)
241
- return &fbcrawl.FacebookImage{
244
+ return &pb.FacebookImage{
242
245
  Id: i,
243
246
  }
244
- })).([]*fbcrawl.FacebookImage)
247
+ })).([]*pb.FacebookImage)
245
248
 
246
249
  if fbDataFt.PhotoId > 0 {
247
- post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
250
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
248
251
  }
249
- result = append(result, post)
252
+ result.Posts = append(result.Posts, post)
253
+ })
254
+ if len(nextCursor) > 0 {
255
+ err = collector.Visit(nextCursor)
256
+ } else {
257
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
258
+ }
259
+
260
+ if err != nil {
261
+ logger.Error("crawl by colly err:", err)
262
+ }
263
+ return err, &result
264
+ }
265
+
266
+ func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
267
+ collector := f.collector.Clone()
268
+ err := setupSharedCollector(collector)
269
+
270
+ result := &pb.FacebookUser{}
271
+
272
+ collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
273
+ parsed, _ := url.Parse(element.Attr("href"))
274
+ result.Username = strings.Split(parsed.Path[1:], "/")[0]
275
+ result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
276
+ })
277
+
278
+ collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
279
+ result.FriendCount = getNumberFromText(element.Text)
250
280
  })
251
281
 
252
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
282
+ collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
283
+ result.Name = element.DOM.Find("strong").First().Text()
284
+ })
285
+
286
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
253
287
  if err != nil {
254
288
  logger.Error("crawl by colly err:", err)
255
289
  }
256
- return err, &fbcrawl.FacebookPostList{Posts: result}
290
+ return err, result
257
291
  }
258
292
 
259
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
293
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
260
294
  collector := f.collector.Clone()
261
295
  err := setupSharedCollector(collector)
262
- currentPage := 1
263
- var result []*fbcrawl.FacebookImage
296
+ result := &pb.FacebookGroup{}
297
+
298
+ collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
299
+ result.Name = element.Text
300
+ })
301
+ collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
302
+ result.Id = getNumberFromText(element.Attr("href"))
303
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
304
+ })
305
+
306
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
307
+ if err != nil {
308
+ logger.Error("crawl by colly err:", err)
309
+ }
310
+ return err, result
311
+ }
312
+
313
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
314
+ collector := f.collector.Clone()
315
+ err := setupSharedCollector(collector)
316
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
264
317
 
265
318
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
266
- currentPage++
267
- logger.Info("Will fetch page", currentPage)
268
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
319
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
269
320
  })
270
321
 
271
322
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
272
- result = append(result, &fbcrawl.FacebookImage{
323
+ result.Images = append(result.Images, &pb.FacebookImage{
273
324
  Id: getImageIdFromHref(element.Attr("href")),
274
325
  })
275
326
  //f.detailCollector.Visit(url)
276
327
  })
328
+ if len(nextCursor) > 0 {
329
+ err = collector.Visit(nextCursor)
330
+ } else {
331
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
332
+ }
277
333
 
278
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
279
334
  if err != nil {
280
335
  logger.Error("crawl by colly err:", err)
281
336
  }
282
- return err, &fbcrawl.FacebookImageList{Images: result}
337
+ return err, &result
283
338
  }
284
339
 
285
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
340
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
286
341
  collector := f.collector.Clone()
287
342
  err := setupSharedCollector(collector)
288
- result := fbcrawl.FacebookImage{Id: imageId}
343
+ result := pb.FacebookImage{Id: imageId}
289
344
 
290
345
  collector.OnHTML("a", func(element *colly.HTMLElement) {
291
346
  result.Url = element.Attr("href")
@@ -298,11 +353,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
298
353
  return err, &result
299
354
  }
300
355
 
301
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
356
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
302
357
  collector := f.collector.Clone()
303
358
  err := setupSharedCollector(collector)
304
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
305
- commentPaging := 0
359
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
306
360
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
307
361
  dataElement := element.DOM.Find("div[data-ft]")
308
362
  if dataElement.Length() > 0 {
@@ -317,9 +371,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
317
371
  }
318
372
  logger.Info("Post ", result)
319
373
  post.Id = result.TopLevelPostId
320
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
321
- post.User = &fbcrawl.FacebookUser{
322
- Id: result.ContentOwnerIdNew,
374
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
375
+ userId, _ := result.ContentOwnerIdNew.Int64()
376
+ post.User = &pb.FacebookUser{
377
+ Id: userId,
323
378
  Name: dataElement.Find("h3 strong:first-child a").Text(),
324
379
  }
325
380
  post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
@@ -337,15 +392,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
337
392
 
338
393
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
339
394
  post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
340
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
395
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
341
396
  i, _ := strconv.ParseInt(id, 10, 64)
342
- return &fbcrawl.FacebookImage{
397
+ return &pb.FacebookImage{
343
398
  Id: i,
344
399
  }
345
- })).([]*fbcrawl.FacebookImage)
400
+ })).([]*pb.FacebookImage)
346
401
 
347
402
  if result.PhotoId > 0 {
348
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
403
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
349
404
  }
350
405
  }
351
406
 
@@ -355,30 +410,33 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
355
410
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
356
411
  logger.Info("comment", commentId)
357
412
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
358
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
359
- Id: commentId,
360
- Post: &fbcrawl.FacebookPost{Id: post.Id},
361
- User: &fbcrawl.FacebookUser{
362
- Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
363
- Name: selection.Find("h3 > a").Text(),
364
- },
365
- Content: selection.Find("h3 + div").Text(),
366
- CreatedAt: createdAtWhenResult.Time.Unix(),
367
- })
413
+ parsed, _ := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
414
+ if len(parsed.Path) > 1 {
415
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
416
+ Id: commentId,
417
+ Post: &pb.FacebookPost{Id: post.Id},
418
+ User: &pb.FacebookUser{
419
+ Username: parsed.Path[1:],
420
+ Name: selection.Find("h3 > a").Text(),
421
+ },
422
+ Content: selection.Find("h3 + div").Text(),
423
+ CreatedAt: createdAtWhenResult.Time.Unix(),
424
+ })
425
+ }
368
426
  })
369
427
 
370
428
  }
371
429
  })
372
430
 
373
431
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
374
- if commentPaging < 3 {
375
- logger.Info("Comment paging", commentPaging)
376
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
377
- commentPaging = commentPaging + 1
378
- }
432
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
379
433
  })
434
+ if len(commentNextCursor) > 0 {
435
+ err = collector.Visit(commentNextCursor)
436
+ } else {
437
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
438
+ }
380
439
 
381
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
382
440
  return err, post
383
441
  }
384
442
 
@@ -392,8 +450,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
392
450
  //}
393
451
 
394
452
  func getUserIdFromCommentHref(href string) int64 {
395
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
396
- return id
453
+ match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
454
+ if len(match) > 0 {
455
+ id, _ := strconv.ParseInt(match[1], 10, 64)
456
+ return id
457
+ }
458
+ return 0
397
459
  }
398
460
 
399
461
  func getUrlFromRedirectHref(href string) string {
@@ -410,15 +472,17 @@ func getImageIdFromHref(href string) int64 {
410
472
  func getNumberFromText(text string) int64 {
411
473
  logger.Error("reaction", text)
412
474
  if len(text) > 0 {
413
- match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
414
- count, _ := strconv.ParseInt(match[1], 10, 64)
415
- switch match[2] {
416
- case "k":
417
- count *= 1000
418
- case "m":
419
- count *= 1000000
475
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
476
+ if len(match) > 0 {
477
+ count, _ := strconv.ParseInt(match[1], 10, 64)
478
+ switch match[2] {
479
+ case "k":
480
+ count *= 1000
481
+ case "m":
482
+ count *= 1000000
483
+ }
484
+ return count
420
485
  }
421
- return count
422
486
  }
423
487
  return 0
424
488
  }