fbcrawl-colly 0.2.2 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 383de209c502265a8533af201e3b9538edffc115af1575bf9662508abb142e00
4
- data.tar.gz: 631c53a8c82e063df262ccac4f6c74f4591296b700ffa627110a9bf0dce40358
3
+ metadata.gz: eed929533973a523ed9cc5ac0cabf35559cb2e50e1c028a7ef2d8b90de15b58b
4
+ data.tar.gz: 385c7dcd852e31acdc3d591b323ba4eb2b8d979b89c8fe55a55e18b00fbbb691
5
5
  SHA512:
6
- metadata.gz: b8c1505f7183c0a30a6d6679319a947be589828771e3bc231915cb841aad5e79b7221091862f6df537a701eaa2bf21bb996ba689e301d00a79964fd47e45f258
7
- data.tar.gz: 2ce2e0870f9455cdaeab48c840e1c9f5fb35afc340e0c56830afca90eb7bcebcf120430b7c3d043d58274d545ab0c7bb99d4062b36885b1e832bb540b7bf5080
6
+ metadata.gz: 16777a1a0ae7d48b2ce7ac3bd3573a351903321a009e81d649379aa8f508986ae4511d0ed85101c8605910d383dfa9ba8b7748d759ab05c7a7076e32c5cf49cd
7
+ data.tar.gz: d902d6226148cadd62d1e0c5a45419cfdd1343831bcfd5dced56bd298982a51d1da1835b048925b347351a01e63e875e8b295e568fa803b512c57183484fd2c6
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.2)
5
- ffi
4
+ fbcrawl-colly (1.0.1)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.12.4)
11
+ google-protobuf (3.13.0)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.30.2)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project is to crawl mbasic.facebook.com using GO Colly.
4
4
 
5
- Ruby gem will available to use
5
+ Ruby gem is only client for GRPC colly service
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,17 +1,92 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Sends a greeting
8
+ rpc Init (Empty) returns (Pointer) {}
9
+ rpc FreeColly (Pointer) returns (Empty) {}
10
+ rpc Login (LoginRequest) returns (LoginResponse) {}
11
+ rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
12
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
13
+ rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
14
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
15
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
16
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
17
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
18
+ }
19
+
20
+ message Empty {
21
+
22
+ }
23
+
24
+ message Pointer {
25
+ int64 address = 1;
26
+ }
27
+
28
+ message LoginRequest {
29
+ Pointer pointer = 1;
30
+ string email = 2;
31
+ string password = 3;
32
+ string totp_secret = 4;
33
+ }
34
+
35
+ message LoginResponse {
36
+ string cookies = 1;
37
+ }
38
+
39
+ message LoginWithCookiesRequest {
40
+ Pointer pointer = 1;
41
+ string cookies = 2;
42
+ }
43
+
44
+ message FetchGroupInfoRequest {
45
+ Pointer pointer = 1;
46
+ string group_username = 2;
47
+ }
48
+
49
+ message FetchUserInfoRequest {
50
+ Pointer pointer = 1;
51
+ string username = 2;
52
+ }
53
+
54
+ message FetchGroupFeedRequest {
55
+ Pointer pointer = 1;
56
+ int64 group_id = 2;
57
+ string next_cursor = 3;
58
+ }
59
+
60
+ message FetchPostRequest {
61
+ Pointer pointer = 1;
62
+ int64 group_id = 2;
63
+ int64 post_id = 3;
64
+ string comment_next_cursor = 4;
65
+ }
66
+
67
+ message FetchContentImagesRequest {
68
+ Pointer pointer = 1;
69
+ int64 post_id = 2;
70
+ string next_cursor = 3;
71
+ }
72
+
73
+ message FetchImageUrlRequest {
74
+ Pointer pointer = 1;
75
+ int64 image_id = 2;
76
+ }
5
77
 
6
78
  // The request message containing the user's name.
7
79
  message FacebookGroup {
8
80
  int64 id = 1;
9
81
  string name = 2;
82
+ int64 member_count = 3;
10
83
  }
11
84
 
12
85
  message FacebookUser {
13
86
  int64 id = 1;
14
87
  string name = 2;
88
+ string username = 3;
89
+ int64 friend_count =4;
15
90
  }
16
91
 
17
92
  message FacebookPost {
@@ -19,12 +94,18 @@ message FacebookPost {
19
94
  FacebookGroup group = 2;
20
95
  FacebookUser user = 3;
21
96
  string content = 4;
97
+ CommentList comments = 5;
22
98
  string content_link = 6;
23
- FacebookImage content_image = 8;
24
99
  repeated FacebookImage content_images = 7;
25
- repeated FacebookComment comments = 5;
100
+ FacebookImage content_image = 8;
26
101
  int64 created_at = 9;
27
102
  int64 reaction_count = 10;
103
+ int64 comment_count = 11;
104
+ }
105
+
106
+ message CommentList {
107
+ repeated FacebookComment comments = 5;
108
+ string next_cursor = 12;
28
109
  }
29
110
 
30
111
  message FacebookImage {
@@ -42,8 +123,10 @@ message FacebookComment {
42
123
 
43
124
  message FacebookPostList {
44
125
  repeated FacebookPost posts = 1;
126
+ string next_cursor = 2;
45
127
  }
46
128
 
47
129
  message FacebookImageList {
48
130
  repeated FacebookImage images = 1;
131
+ string next_cursor = 2;
49
132
  }
@@ -6,16 +6,18 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/debug"
11
+ "github.com/gocolly/colly/v2/extensions"
12
+ "github.com/gocolly/colly/v2/storage"
12
13
  "github.com/google/logger"
13
14
  "github.com/olebedev/when"
14
15
  "github.com/olebedev/when/rules/common"
15
16
  "github.com/olebedev/when/rules/en"
16
17
  "github.com/thoas/go-funk"
18
+ "github.com/xlzd/gotp"
17
19
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
20
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
21
  "regexp"
20
22
  "strconv"
21
23
  "strings"
@@ -33,7 +35,7 @@ type FbDataInsight struct {
33
35
  FbDataPostContext `json:"post_context"`
34
36
  }
35
37
  type FbDataFt struct {
36
- ContentOwnerIdNew int64 `json:"content_owner_id_new"`
38
+ ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
37
39
  PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
40
  PhotoId int64 `json:"photo_id,string"`
39
41
  PageId int64 `json:"page_id,string"`
@@ -42,7 +44,7 @@ type FbDataFt struct {
42
44
  }
43
45
 
44
46
  func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
47
+ logger.Info("OnRequest ", request.URL)
46
48
  //request.Headers.Set("Host", "facebook.com")
47
49
  request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
50
  request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
@@ -65,6 +67,7 @@ func setupSharedCollector(collector *colly.Collector) error {
65
67
  collector.AllowURLRevisit = true
66
68
  collector.OnRequest(sharedOnRequest)
67
69
  collector.OnResponse(sharedOnResponse)
70
+ collector.SetDebugger(&debug.LogDebugger{})
68
71
  collector.OnError(func(resp *colly.Response, errHttp error) {
69
72
  err = errHttp
70
73
  logger.Error("OnError", err)
@@ -106,26 +109,29 @@ func New() *Fbcolly {
106
109
  return &f
107
110
  }
108
111
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
113
  collector := f.collector.Clone()
111
114
  err := setupSharedCollector(collector)
112
115
 
113
116
  logger.Info("Login using email", email)
114
117
  loggedIn := false
115
-
118
+ firstLogin := true
116
119
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
117
- logger.Info("OnHTML login_form")
118
- loginURL, err, reqMap := getForm(element, err)
119
- if err != nil {
120
- logger.Error(err)
121
- return
122
- }
123
- reqMap["email"] = email
124
- reqMap["pass"] = password
125
- logger.Info("req map:", reqMap)
126
- err = collector.Post(loginURL, reqMap)
127
- if err != nil {
128
- logger.Error("post err:", err)
120
+ if firstLogin {
121
+ firstLogin = false
122
+ logger.Info("OnHTML login_form")
123
+ loginURL, err, reqMap := getForm(element, err)
124
+ if err != nil {
125
+ logger.Error(err)
126
+ return
127
+ }
128
+ reqMap["email"] = email
129
+ reqMap["pass"] = password
130
+ logger.Info("req map:", reqMap)
131
+ err = collector.Post(loginURL, reqMap)
132
+ if err != nil {
133
+ logger.Error("post err:", err)
134
+ }
129
135
  }
130
136
  })
131
137
 
@@ -152,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
152
158
  //logger.Info("Please input OTP")
153
159
  //reader := bufio.NewReader(os.Stdin)
154
160
  //code, _ := reader.ReadString('\n')
155
- code := otp[0:6]
156
- reqMap["approvals_code"] = code
157
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
158
167
  } else {
159
168
  logger.Info("OnHTML Only Continue checkpoint")
160
169
 
@@ -188,68 +197,152 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
188
197
 
189
198
  }
190
199
 
191
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
192
201
  collector := f.collector.Clone()
193
202
  err := setupSharedCollector(collector)
194
- currentPage := 1
195
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
196
204
 
197
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
198
- currentPage++
199
- if currentPage < 3 {
200
- logger.Info("Will fetch page", currentPage)
201
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
207
+ })
208
+ collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
209
+ dataElement := element
210
+ post := &pb.FacebookPost{}
211
+ var fbDataFt FbDataFt
212
+ jsonData := dataElement.Attr("data-ft")
213
+
214
+ logger.Info(jsonData)
215
+ err = json.Unmarshal([]byte(jsonData), &fbDataFt)
216
+ if err != nil {
217
+ logger.Error(err)
218
+ return
219
+ }
220
+ logger.Info("Post ", fbDataFt)
221
+ post.Id = fbDataFt.TopLevelPostId
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
224
+ post.User = &pb.FacebookUser{
225
+ Id: userId,
226
+ Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
202
227
  }
228
+ post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
229
+ //Content
230
+
231
+ //NO BACKGROUND TEXT ONLY
232
+ post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
233
+ return selection.Text()
234
+ }), "\n")
235
+
236
+ if len(post.Content) == 0 {
237
+ // TEXT WITH BACKGROUND
238
+ post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
239
+ }
240
+
241
+ post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
242
+ post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
243
+ post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
244
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
245
+ i, _ := strconv.ParseInt(id, 10, 64)
246
+ return &pb.FacebookImage{
247
+ Id: i,
248
+ }
249
+ })).([]*pb.FacebookImage)
250
+
251
+ if fbDataFt.PhotoId > 0 {
252
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
253
+ }
254
+ result.Posts = append(result.Posts, post)
203
255
  })
256
+ if len(nextCursor) > 0 {
257
+ err = collector.Visit(nextCursor)
258
+ } else {
259
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
260
+ }
204
261
 
205
- collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
- logger.Info("Post found at", element.Attr("href"))
207
- u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
208
- postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
209
- result = append(result, &fbcrawl.FacebookPost{
210
- Id: postId,
211
- Group: &fbcrawl.FacebookGroup{Id: groupId},
212
- })
213
- //f.detailCollector.Visit(url)
262
+ if err != nil {
263
+ logger.Error("crawl by colly err:", err)
264
+ }
265
+ return err, &result
266
+ }
267
+
268
+ func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
269
+ collector := f.collector.Clone()
270
+ err := setupSharedCollector(collector)
271
+
272
+ result := &pb.FacebookUser{}
273
+
274
+ collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
275
+ parsed, _ := url.Parse(element.Attr("href"))
276
+ result.Username = strings.Split(parsed.Path[1:], "/")[0]
277
+ result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
278
+ })
279
+
280
+ collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
281
+ result.FriendCount = getNumberFromText(element.Text)
282
+ })
283
+
284
+ collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
285
+ result.Name = element.DOM.Find("strong").First().Text()
286
+ })
287
+
288
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
289
+ if err != nil {
290
+ logger.Error("crawl by colly err:", err)
291
+ }
292
+ return err, result
293
+ }
294
+
295
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
296
+ collector := f.collector.Clone()
297
+ err := setupSharedCollector(collector)
298
+ result := &pb.FacebookGroup{}
299
+
300
+ collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
301
+ result.Name = element.Text
302
+ })
303
+ collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
304
+ result.Id = getNumberFromText(element.Attr("href"))
305
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
214
306
  })
215
307
 
216
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
308
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
217
309
  if err != nil {
218
310
  logger.Error("crawl by colly err:", err)
219
311
  }
220
- return err, &fbcrawl.FacebookPostList{Posts: result}
312
+ return err, result
221
313
  }
222
314
 
223
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
315
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
224
316
  collector := f.collector.Clone()
225
317
  err := setupSharedCollector(collector)
226
- currentPage := 1
227
- var result []*fbcrawl.FacebookImage
318
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
228
319
 
229
320
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
230
- currentPage++
231
- logger.Info("Will fetch page", currentPage)
232
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
321
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
233
322
  })
234
323
 
235
324
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
236
- result = append(result, &fbcrawl.FacebookImage{
325
+ result.Images = append(result.Images, &pb.FacebookImage{
237
326
  Id: getImageIdFromHref(element.Attr("href")),
238
327
  })
239
328
  //f.detailCollector.Visit(url)
240
329
  })
330
+ if len(nextCursor) > 0 {
331
+ err = collector.Visit(nextCursor)
332
+ } else {
333
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
334
+ }
241
335
 
242
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
243
336
  if err != nil {
244
337
  logger.Error("crawl by colly err:", err)
245
338
  }
246
- return err, &fbcrawl.FacebookImageList{Images: result}
339
+ return err, &result
247
340
  }
248
341
 
249
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
342
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
250
343
  collector := f.collector.Clone()
251
344
  err := setupSharedCollector(collector)
252
- result := fbcrawl.FacebookImage{Id: imageId}
345
+ result := pb.FacebookImage{Id: imageId}
253
346
 
254
347
  collector.OnHTML("a", func(element *colly.HTMLElement) {
255
348
  result.Url = element.Attr("href")
@@ -262,11 +355,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
262
355
  return err, &result
263
356
  }
264
357
 
265
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
358
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
266
359
  collector := f.collector.Clone()
267
360
  err := setupSharedCollector(collector)
268
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
269
- commentPaging := 0
361
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
270
362
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
271
363
  dataElement := element.DOM.Find("div[data-ft]")
272
364
  if dataElement.Length() > 0 {
@@ -281,9 +373,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
281
373
  }
282
374
  logger.Info("Post ", result)
283
375
  post.Id = result.TopLevelPostId
284
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
285
- post.User = &fbcrawl.FacebookUser{
286
- Id: result.ContentOwnerIdNew,
376
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
377
+ userId, _ := result.ContentOwnerIdNew.Int64()
378
+ post.User = &pb.FacebookUser{
379
+ Id: userId,
287
380
  Name: dataElement.Find("h3 strong:first-child a").Text(),
288
381
  }
289
382
  post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
@@ -300,21 +393,17 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
300
393
  }
301
394
 
302
395
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
- post.ReactionCount = getReactionFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
304
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
396
+ post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
397
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
305
398
  i, _ := strconv.ParseInt(id, 10, 64)
306
- return &fbcrawl.FacebookImage{
399
+ return &pb.FacebookImage{
307
400
  Id: i,
308
401
  }
309
- })).([]*fbcrawl.FacebookImage)
402
+ })).([]*pb.FacebookImage)
310
403
 
311
404
  if result.PhotoId > 0 {
312
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
405
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
313
406
  }
314
-
315
- logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
316
- return selection.Text()
317
- }), "\n"))
318
407
  }
319
408
 
320
409
  //Comment
@@ -323,12 +412,13 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
323
412
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
324
413
  logger.Info("comment", commentId)
325
414
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
326
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
415
+ parsed, _ := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
416
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
327
417
  Id: commentId,
328
- Post: &fbcrawl.FacebookPost{Id: post.Id},
329
- User: &fbcrawl.FacebookUser{
330
- Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
331
- Name: selection.Find("h3 > a").Text(),
418
+ Post: &pb.FacebookPost{Id: post.Id},
419
+ User: &pb.FacebookUser{
420
+ Username: parsed.Path[1:],
421
+ Name: selection.Find("h3 > a").Text(),
332
422
  },
333
423
  Content: selection.Find("h3 + div").Text(),
334
424
  CreatedAt: createdAtWhenResult.Time.Unix(),
@@ -339,14 +429,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
339
429
  })
340
430
 
341
431
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
342
- if commentPaging < 3 {
343
- logger.Info("Comment paging", commentPaging)
344
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
345
- commentPaging = commentPaging + 1
346
- }
432
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
347
433
  })
434
+ if len(commentNextCursor) > 0 {
435
+ err = collector.Visit(commentNextCursor)
436
+ } else {
437
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
438
+ }
348
439
 
349
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
350
440
  return err, post
351
441
  }
352
442
 
@@ -360,8 +450,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
360
450
  //}
361
451
 
362
452
  func getUserIdFromCommentHref(href string) int64 {
363
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
364
- return id
453
+ match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
454
+ if len(match) > 0 {
455
+ id, _ := strconv.ParseInt(match[1], 10, 64)
456
+ return id
457
+ }
458
+ return 0
365
459
  }
366
460
 
367
461
  func getUrlFromRedirectHref(href string) string {
@@ -375,11 +469,20 @@ func getImageIdFromHref(href string) int64 {
375
469
  return i
376
470
  }
377
471
 
378
- func getReactionFromText(text string) int64 {
472
+ func getNumberFromText(text string) int64 {
379
473
  logger.Error("reaction", text)
380
474
  if len(text) > 0 {
381
- id, _ := strconv.ParseInt(regexp.MustCompile("(\\d+)").FindStringSubmatch(text)[1], 10, 64)
382
- return id
475
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
476
+ if len(match) > 0 {
477
+ count, _ := strconv.ParseInt(match[1], 10, 64)
478
+ switch match[2] {
479
+ case "k":
480
+ count *= 1000
481
+ case "m":
482
+ count *= 1000000
483
+ }
484
+ return count
485
+ }
383
486
  }
384
487
  return 0
385
488
  }