fbcrawl-colly 0.2.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6e43a203c1c38131f0d9eb32b67a1859bd1530aac9249751ff0329a5901388f6
4
- data.tar.gz: f3520762f00d2b1a2d475a9b97d2141fe8cd98faf718ccf3d8a797e61489f976
3
+ metadata.gz: 138016b11ed36541b96961dbfad0627b5d8bccd26a7638f95e5a8806d0fadf31
4
+ data.tar.gz: 376c58bc102d6ece54631f0151889d7546af436e9f40f64d994ac44b7110e29e
5
5
  SHA512:
6
- metadata.gz: 2560aa6a3239671a14f82a95034bbad03684dd44b0fc32cb2092b404678b2ec0e124a33db64a92f5c22f4cd3d9d94a42ccc40a6ab2e1e362088d0f02f05d22f3
7
- data.tar.gz: 305f3d463d75bc3edcfcd1b5a985d9a9882e990e850786132eb1b6d4e93f4e3e31a2c0e9aeb23141cc0633cd1d5ad8e1d88ec4c0e3e4d2d201a6298a7c8699c6
6
+ metadata.gz: d8580cf0cf980989aab899d4f1185a33dff13743cfedea5487e59f8a990cd23d8c9c4751e18c2192b396c27c4ef5d5e95919404522fc4399b9754c1c0753f63a
7
+ data.tar.gz: 84b182c8dcfd4a3b5aa7297c6cd3bca485a4f372d65579f03220c32bcf2fcda1d95be02cff4caf5c0bb88a997daa53684b6f25fc8c088be4e7e31406c4f1f4ba
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.3)
5
- ffi
4
+ fbcrawl-colly (1.1.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.12.4)
11
+ google-protobuf (3.13.0)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.31.1)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project is to crawl mbasic.facebook.com using GO Colly.
4
4
 
5
- Ruby gem will available to use
5
+ The Ruby gem is only a client for the gRPC Colly service
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,17 +1,87 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Logs in with email, password and TOTP secret, and returns the resulting cookies
8
+ rpc Login (LoginRequest) returns (LoginResponse) {}
9
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
10
+ rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
11
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
12
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
13
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
14
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
15
+ }
16
+
17
+ message Empty {
18
+
19
+ }
20
+
21
+ message Context {
22
+ string cookies = 1;
23
+ }
24
+
25
+ message LoginRequest {
26
+ string email = 2;
27
+ string password = 3;
28
+ string totp_secret = 4;
29
+ }
30
+
31
+ message LoginResponse {
32
+ string cookies = 1;
33
+ }
34
+
35
+ message LoginWithCookiesRequest {
36
+ string cookies = 1;
37
+ }
38
+
39
+ message FetchGroupInfoRequest {
40
+ Context context = 1;
41
+ string group_username = 2;
42
+ }
43
+
44
+ message FetchUserInfoRequest {
45
+ Context context = 1;
46
+ string username = 2;
47
+ }
48
+
49
+ message FetchGroupFeedRequest {
50
+ Context context = 1;
51
+ int64 group_id = 2;
52
+ string next_cursor = 3;
53
+ }
54
+
55
+ message FetchPostRequest {
56
+ Context context = 1;
57
+ int64 group_id = 2;
58
+ int64 post_id = 3;
59
+ string comment_next_cursor = 4;
60
+ }
61
+
62
+ message FetchContentImagesRequest {
63
+ Context context = 1;
64
+ int64 post_id = 2;
65
+ string next_cursor = 3;
66
+ }
67
+
68
+ message FetchImageUrlRequest {
69
+ Context context = 1;
70
+ int64 image_id = 2;
71
+ }
5
72
 
6
73
  // Describes a Facebook group.
7
74
  message FacebookGroup {
8
75
  int64 id = 1;
9
76
  string name = 2;
77
+ int64 member_count = 3;
10
78
  }
11
79
 
12
80
  message FacebookUser {
13
81
  int64 id = 1;
14
82
  string name = 2;
83
+ string username = 3;
84
+ int64 friend_count = 4;
15
85
  }
16
86
 
17
87
  message FacebookPost {
@@ -19,15 +89,20 @@ message FacebookPost {
19
89
  FacebookGroup group = 2;
20
90
  FacebookUser user = 3;
21
91
  string content = 4;
92
+ CommentList comments = 5;
22
93
  string content_link = 6;
23
- FacebookImage content_image = 8;
24
94
  repeated FacebookImage content_images = 7;
25
- repeated FacebookComment comments = 5;
95
+ FacebookImage content_image = 8;
26
96
  int64 created_at = 9;
27
97
  int64 reaction_count = 10;
28
98
  int64 comment_count = 11;
29
99
  }
30
100
 
101
+ message CommentList {
102
+ repeated FacebookComment comments = 5;
103
+ string next_cursor = 12;
104
+ }
105
+
31
106
  message FacebookImage {
32
107
  int64 id = 1;
33
108
  string url = 2;
@@ -43,8 +118,10 @@ message FacebookComment {
43
118
 
44
119
  message FacebookPostList {
45
120
  repeated FacebookPost posts = 1;
121
+ string next_cursor = 2;
46
122
  }
47
123
 
48
124
  message FacebookImageList {
49
125
  repeated FacebookImage images = 1;
126
+ string next_cursor = 2;
50
127
  }
@@ -6,16 +6,17 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/extensions"
11
+ "github.com/gocolly/colly/v2/storage"
12
12
  "github.com/google/logger"
13
13
  "github.com/olebedev/when"
14
14
  "github.com/olebedev/when/rules/common"
15
15
  "github.com/olebedev/when/rules/en"
16
16
  "github.com/thoas/go-funk"
17
+ "github.com/xlzd/gotp"
17
18
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
19
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
20
  "regexp"
20
21
  "strconv"
21
22
  "strings"
@@ -33,7 +34,7 @@ type FbDataInsight struct {
33
34
  FbDataPostContext `json:"post_context"`
34
35
  }
35
36
  type FbDataFt struct {
36
- ContentOwnerIdNew int64 `json:"content_owner_id_new"`
37
+ ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
37
38
  PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
39
  PhotoId int64 `json:"photo_id,string"`
39
40
  PageId int64 `json:"page_id,string"`
@@ -42,7 +43,7 @@ type FbDataFt struct {
42
43
  }
43
44
 
44
45
  func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
46
+ logger.Info("OnRequest ", request.URL)
46
47
  //request.Headers.Set("Host", "facebook.com")
47
48
  request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
49
  request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
@@ -106,26 +107,29 @@ func New() *Fbcolly {
106
107
  return &f
107
108
  }
108
109
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
110
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
111
  collector := f.collector.Clone()
111
112
  err := setupSharedCollector(collector)
112
113
 
113
114
  logger.Info("Login using email", email)
114
115
  loggedIn := false
115
-
116
+ firstLogin := true
116
117
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
117
- logger.Info("OnHTML login_form")
118
- loginURL, err, reqMap := getForm(element, err)
119
- if err != nil {
120
- logger.Error(err)
121
- return
122
- }
123
- reqMap["email"] = email
124
- reqMap["pass"] = password
125
- logger.Info("req map:", reqMap)
126
- err = collector.Post(loginURL, reqMap)
127
- if err != nil {
128
- logger.Error("post err:", err)
118
+ if firstLogin {
119
+ firstLogin = false
120
+ logger.Info("OnHTML login_form")
121
+ loginURL, err, reqMap := getForm(element, err)
122
+ if err != nil {
123
+ logger.Error(err)
124
+ return
125
+ }
126
+ reqMap["email"] = email
127
+ reqMap["pass"] = password
128
+ logger.Info("req map:", reqMap)
129
+ err = collector.Post(loginURL, reqMap)
130
+ if err != nil {
131
+ logger.Error("post err:", err)
132
+ }
129
133
  }
130
134
  })
131
135
 
@@ -152,9 +156,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
152
156
  //logger.Info("Please input OTP")
153
157
  //reader := bufio.NewReader(os.Stdin)
154
158
  //code, _ := reader.ReadString('\n')
155
- code := otp[0:6]
156
- reqMap["approvals_code"] = code
157
- shouldSubmit = true
159
+ if len(totpSecret) > 0 {
160
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
161
+ reqMap["approvals_code"] = code
162
+ shouldSubmit = true
163
+ }
164
+
158
165
  } else {
159
166
  logger.Info("OnHTML Only Continue checkpoint")
160
167
 
@@ -188,22 +195,17 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
188
195
 
189
196
  }
190
197
 
191
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
198
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
192
199
  collector := f.collector.Clone()
193
200
  err := setupSharedCollector(collector)
194
- currentPage := 1
195
- var result []*fbcrawl.FacebookPost
201
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
196
202
 
197
203
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
198
- currentPage++
199
- if currentPage < 3 {
200
- logger.Info("Will fetch page", currentPage)
201
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
202
- }
204
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
203
205
  })
204
- collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
206
+ collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
205
207
  dataElement := element
206
- post := &fbcrawl.FacebookPost{}
208
+ post := &pb.FacebookPost{}
207
209
  var fbDataFt FbDataFt
208
210
  jsonData := dataElement.Attr("data-ft")
209
211
 
@@ -215,9 +217,10 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
215
217
  }
216
218
  logger.Info("Post ", fbDataFt)
217
219
  post.Id = fbDataFt.TopLevelPostId
218
- post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
219
- post.User = &fbcrawl.FacebookUser{
220
- Id: fbDataFt.ContentOwnerIdNew,
220
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
221
+ userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
222
+ post.User = &pb.FacebookUser{
223
+ Id: userId,
221
224
  Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
222
225
  }
223
226
  post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
@@ -236,56 +239,108 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
236
239
  post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
237
240
  post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
238
241
  post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
239
- post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
242
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
240
243
  i, _ := strconv.ParseInt(id, 10, 64)
241
- return &fbcrawl.FacebookImage{
244
+ return &pb.FacebookImage{
242
245
  Id: i,
243
246
  }
244
- })).([]*fbcrawl.FacebookImage)
247
+ })).([]*pb.FacebookImage)
245
248
 
246
249
  if fbDataFt.PhotoId > 0 {
247
- post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
250
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
248
251
  }
249
- result = append(result, post)
252
+ result.Posts = append(result.Posts, post)
253
+ })
254
+ if len(nextCursor) > 0 {
255
+ err = collector.Visit(nextCursor)
256
+ } else {
257
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
258
+ }
259
+
260
+ if err != nil {
261
+ logger.Error("crawl by colly err:", err)
262
+ }
263
+ return err, &result
264
+ }
265
+
266
+ func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
267
+ collector := f.collector.Clone()
268
+ err := setupSharedCollector(collector)
269
+
270
+ result := &pb.FacebookUser{}
271
+
272
+ collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
273
+ parsed, _ := url.Parse(element.Attr("href"))
274
+ result.Username = strings.Split(parsed.Path[1:], "/")[0]
275
+ result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
276
+ })
277
+
278
+ collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
279
+ result.FriendCount = getNumberFromText(element.Text)
250
280
  })
251
281
 
252
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
282
+ collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
283
+ result.Name = element.DOM.Find("strong").First().Text()
284
+ })
285
+
286
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
253
287
  if err != nil {
254
288
  logger.Error("crawl by colly err:", err)
255
289
  }
256
- return err, &fbcrawl.FacebookPostList{Posts: result}
290
+ return err, result
257
291
  }
258
292
 
259
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
293
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
260
294
  collector := f.collector.Clone()
261
295
  err := setupSharedCollector(collector)
262
- currentPage := 1
263
- var result []*fbcrawl.FacebookImage
296
+ result := &pb.FacebookGroup{}
297
+
298
+ collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
299
+ result.Name = element.Text
300
+ })
301
+ collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
302
+ result.Id = getNumberFromText(element.Attr("href"))
303
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
304
+ })
305
+
306
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
307
+ if err != nil {
308
+ logger.Error("crawl by colly err:", err)
309
+ }
310
+ return err, result
311
+ }
312
+
313
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
314
+ collector := f.collector.Clone()
315
+ err := setupSharedCollector(collector)
316
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
264
317
 
265
318
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
266
- currentPage++
267
- logger.Info("Will fetch page", currentPage)
268
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
319
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
269
320
  })
270
321
 
271
322
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
272
- result = append(result, &fbcrawl.FacebookImage{
323
+ result.Images = append(result.Images, &pb.FacebookImage{
273
324
  Id: getImageIdFromHref(element.Attr("href")),
274
325
  })
275
326
  //f.detailCollector.Visit(url)
276
327
  })
328
+ if len(nextCursor) > 0 {
329
+ err = collector.Visit(nextCursor)
330
+ } else {
331
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
332
+ }
277
333
 
278
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
279
334
  if err != nil {
280
335
  logger.Error("crawl by colly err:", err)
281
336
  }
282
- return err, &fbcrawl.FacebookImageList{Images: result}
337
+ return err, &result
283
338
  }
284
339
 
285
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
340
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
286
341
  collector := f.collector.Clone()
287
342
  err := setupSharedCollector(collector)
288
- result := fbcrawl.FacebookImage{Id: imageId}
343
+ result := pb.FacebookImage{Id: imageId}
289
344
 
290
345
  collector.OnHTML("a", func(element *colly.HTMLElement) {
291
346
  result.Url = element.Attr("href")
@@ -298,11 +353,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
298
353
  return err, &result
299
354
  }
300
355
 
301
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
356
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
302
357
  collector := f.collector.Clone()
303
358
  err := setupSharedCollector(collector)
304
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
305
- commentPaging := 0
359
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
306
360
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
307
361
  dataElement := element.DOM.Find("div[data-ft]")
308
362
  if dataElement.Length() > 0 {
@@ -317,9 +371,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
317
371
  }
318
372
  logger.Info("Post ", result)
319
373
  post.Id = result.TopLevelPostId
320
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
321
- post.User = &fbcrawl.FacebookUser{
322
- Id: result.ContentOwnerIdNew,
374
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
375
+ userId, _ := result.ContentOwnerIdNew.Int64()
376
+ post.User = &pb.FacebookUser{
377
+ Id: userId,
323
378
  Name: dataElement.Find("h3 strong:first-child a").Text(),
324
379
  }
325
380
  post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
@@ -337,15 +392,15 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
337
392
 
338
393
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
339
394
  post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
340
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
395
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
341
396
  i, _ := strconv.ParseInt(id, 10, 64)
342
- return &fbcrawl.FacebookImage{
397
+ return &pb.FacebookImage{
343
398
  Id: i,
344
399
  }
345
- })).([]*fbcrawl.FacebookImage)
400
+ })).([]*pb.FacebookImage)
346
401
 
347
402
  if result.PhotoId > 0 {
348
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
403
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
349
404
  }
350
405
  }
351
406
 
@@ -355,30 +410,33 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
355
410
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
356
411
  logger.Info("comment", commentId)
357
412
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
358
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
359
- Id: commentId,
360
- Post: &fbcrawl.FacebookPost{Id: post.Id},
361
- User: &fbcrawl.FacebookUser{
362
- Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
363
- Name: selection.Find("h3 > a").Text(),
364
- },
365
- Content: selection.Find("h3 + div").Text(),
366
- CreatedAt: createdAtWhenResult.Time.Unix(),
367
- })
413
+ parsed, _ := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
414
+ if len(parsed.Path) > 1 {
415
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
416
+ Id: commentId,
417
+ Post: &pb.FacebookPost{Id: post.Id},
418
+ User: &pb.FacebookUser{
419
+ Username: parsed.Path[1:],
420
+ Name: selection.Find("h3 > a").Text(),
421
+ },
422
+ Content: selection.Find("h3 + div").Text(),
423
+ CreatedAt: createdAtWhenResult.Time.Unix(),
424
+ })
425
+ }
368
426
  })
369
427
 
370
428
  }
371
429
  })
372
430
 
373
431
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
374
- if commentPaging < 3 {
375
- logger.Info("Comment paging", commentPaging)
376
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
377
- commentPaging = commentPaging + 1
378
- }
432
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
379
433
  })
434
+ if len(commentNextCursor) > 0 {
435
+ err = collector.Visit(commentNextCursor)
436
+ } else {
437
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
438
+ }
380
439
 
381
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
382
440
  return err, post
383
441
  }
384
442
 
@@ -392,8 +450,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
392
450
  //}
393
451
 
394
452
  func getUserIdFromCommentHref(href string) int64 {
395
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
396
- return id
453
+ match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
454
+ if len(match) > 0 {
455
+ id, _ := strconv.ParseInt(match[1], 10, 64)
456
+ return id
457
+ }
458
+ return 0
397
459
  }
398
460
 
399
461
  func getUrlFromRedirectHref(href string) string {
@@ -410,15 +472,17 @@ func getImageIdFromHref(href string) int64 {
410
472
  func getNumberFromText(text string) int64 {
411
473
  logger.Error("reaction", text)
412
474
  if len(text) > 0 {
413
- match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
414
- count, _ := strconv.ParseInt(match[1], 10, 64)
415
- switch match[2] {
416
- case "k":
417
- count *= 1000
418
- case "m":
419
- count *= 1000000
475
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
476
+ if len(match) > 0 {
477
+ count, _ := strconv.ParseInt(match[1], 10, 64)
478
+ switch match[2] {
479
+ case "k":
480
+ count *= 1000
481
+ case "m":
482
+ count *= 1000000
483
+ }
484
+ return count
420
485
  }
421
- return count
422
486
  }
423
487
  return 0
424
488
  }