fbcrawl-colly 0.2.2 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 383de209c502265a8533af201e3b9538edffc115af1575bf9662508abb142e00
4
- data.tar.gz: 631c53a8c82e063df262ccac4f6c74f4591296b700ffa627110a9bf0dce40358
3
+ metadata.gz: eed929533973a523ed9cc5ac0cabf35559cb2e50e1c028a7ef2d8b90de15b58b
4
+ data.tar.gz: 385c7dcd852e31acdc3d591b323ba4eb2b8d979b89c8fe55a55e18b00fbbb691
5
5
  SHA512:
6
- metadata.gz: b8c1505f7183c0a30a6d6679319a947be589828771e3bc231915cb841aad5e79b7221091862f6df537a701eaa2bf21bb996ba689e301d00a79964fd47e45f258
7
- data.tar.gz: 2ce2e0870f9455cdaeab48c840e1c9f5fb35afc340e0c56830afca90eb7bcebcf120430b7c3d043d58274d545ab0c7bb99d4062b36885b1e832bb540b7bf5080
6
+ metadata.gz: 16777a1a0ae7d48b2ce7ac3bd3573a351903321a009e81d649379aa8f508986ae4511d0ed85101c8605910d383dfa9ba8b7748d759ab05c7a7076e32c5cf49cd
7
+ data.tar.gz: d902d6226148cadd62d1e0c5a45419cfdd1343831bcfd5dced56bd298982a51d1da1835b048925b347351a01e63e875e8b295e568fa803b512c57183484fd2c6
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.2)
5
- ffi
4
+ fbcrawl-colly (1.0.1)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.12.4)
11
+ google-protobuf (3.13.0)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.30.2)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project crawls mbasic.facebook.com using Go Colly.
4
4
 
5
- Ruby gem will available to use
5
+ The Ruby gem is only a client for the gRPC Colly service.
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,17 +1,92 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Init returns a Pointer; FreeColly takes one, and every other request message embeds one.
8
+ rpc Init (Empty) returns (Pointer) {}
9
+ rpc FreeColly (Pointer) returns (Empty) {}
10
+ rpc Login (LoginRequest) returns (LoginResponse) {}
11
+ rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
12
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
13
+ rpc FetchUserInfo (FetchUserInfoRequest) returns (FacebookUser) {}
14
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
15
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
16
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
17
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
18
+ }
19
+
20
+ message Empty {
21
+
22
+ }
23
+
24
+ message Pointer {
25
+ int64 address = 1;
26
+ }
27
+
28
+ message LoginRequest {
29
+ Pointer pointer = 1;
30
+ string email = 2;
31
+ string password = 3;
32
+ string totp_secret = 4;
33
+ }
34
+
35
+ message LoginResponse {
36
+ string cookies = 1;
37
+ }
38
+
39
+ message LoginWithCookiesRequest {
40
+ Pointer pointer = 1;
41
+ string cookies = 2;
42
+ }
43
+
44
+ message FetchGroupInfoRequest {
45
+ Pointer pointer = 1;
46
+ string group_username = 2;
47
+ }
48
+
49
+ message FetchUserInfoRequest {
50
+ Pointer pointer = 1;
51
+ string username = 2;
52
+ }
53
+
54
+ message FetchGroupFeedRequest {
55
+ Pointer pointer = 1;
56
+ int64 group_id = 2;
57
+ string next_cursor = 3;
58
+ }
59
+
60
+ message FetchPostRequest {
61
+ Pointer pointer = 1;
62
+ int64 group_id = 2;
63
+ int64 post_id = 3;
64
+ string comment_next_cursor = 4;
65
+ }
66
+
67
+ message FetchContentImagesRequest {
68
+ Pointer pointer = 1;
69
+ int64 post_id = 2;
70
+ string next_cursor = 3;
71
+ }
72
+
73
+ message FetchImageUrlRequest {
74
+ Pointer pointer = 1;
75
+ int64 image_id = 2;
76
+ }
5
77
 
6
78
  // A Facebook group: its numeric id, display name, and member count.
7
79
  message FacebookGroup {
8
80
  int64 id = 1;
9
81
  string name = 2;
82
+ int64 member_count = 3;
10
83
  }
11
84
 
12
85
  message FacebookUser {
13
86
  int64 id = 1;
14
87
  string name = 2;
88
+ string username = 3;
89
+ int64 friend_count =4;
15
90
  }
16
91
 
17
92
  message FacebookPost {
@@ -19,12 +94,18 @@ message FacebookPost {
19
94
  FacebookGroup group = 2;
20
95
  FacebookUser user = 3;
21
96
  string content = 4;
97
+ CommentList comments = 5;
22
98
  string content_link = 6;
23
- FacebookImage content_image = 8;
24
99
  repeated FacebookImage content_images = 7;
25
- repeated FacebookComment comments = 5;
100
+ FacebookImage content_image = 8;
26
101
  int64 created_at = 9;
27
102
  int64 reaction_count = 10;
103
+ int64 comment_count = 11;
104
+ }
105
+
106
+ message CommentList {
107
+ repeated FacebookComment comments = 5;
108
+ string next_cursor = 12;
28
109
  }
29
110
 
30
111
  message FacebookImage {
@@ -42,8 +123,10 @@ message FacebookComment {
42
123
 
43
124
  message FacebookPostList {
44
125
  repeated FacebookPost posts = 1;
126
+ string next_cursor = 2;
45
127
  }
46
128
 
47
129
  message FacebookImageList {
48
130
  repeated FacebookImage images = 1;
131
+ string next_cursor = 2;
49
132
  }
@@ -6,16 +6,18 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/debug"
11
+ "github.com/gocolly/colly/v2/extensions"
12
+ "github.com/gocolly/colly/v2/storage"
12
13
  "github.com/google/logger"
13
14
  "github.com/olebedev/when"
14
15
  "github.com/olebedev/when/rules/common"
15
16
  "github.com/olebedev/when/rules/en"
16
17
  "github.com/thoas/go-funk"
18
+ "github.com/xlzd/gotp"
17
19
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
20
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
21
  "regexp"
20
22
  "strconv"
21
23
  "strings"
@@ -33,7 +35,7 @@ type FbDataInsight struct {
33
35
  FbDataPostContext `json:"post_context"`
34
36
  }
35
37
  type FbDataFt struct {
36
- ContentOwnerIdNew int64 `json:"content_owner_id_new"`
38
+ ContentOwnerIdNew json.Number `json:"content_owner_id_new"`
37
39
  PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
40
  PhotoId int64 `json:"photo_id,string"`
39
41
  PageId int64 `json:"page_id,string"`
@@ -42,7 +44,7 @@ type FbDataFt struct {
42
44
  }
43
45
 
44
46
  func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
47
+ logger.Info("OnRequest ", request.URL)
46
48
  //request.Headers.Set("Host", "facebook.com")
47
49
  request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
50
  request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
@@ -65,6 +67,7 @@ func setupSharedCollector(collector *colly.Collector) error {
65
67
  collector.AllowURLRevisit = true
66
68
  collector.OnRequest(sharedOnRequest)
67
69
  collector.OnResponse(sharedOnResponse)
70
+ collector.SetDebugger(&debug.LogDebugger{})
68
71
  collector.OnError(func(resp *colly.Response, errHttp error) {
69
72
  err = errHttp
70
73
  logger.Error("OnError", err)
@@ -106,26 +109,29 @@ func New() *Fbcolly {
106
109
  return &f
107
110
  }
108
111
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
113
  collector := f.collector.Clone()
111
114
  err := setupSharedCollector(collector)
112
115
 
113
116
  logger.Info("Login using email", email)
114
117
  loggedIn := false
115
-
118
+ firstLogin := true
116
119
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
117
- logger.Info("OnHTML login_form")
118
- loginURL, err, reqMap := getForm(element, err)
119
- if err != nil {
120
- logger.Error(err)
121
- return
122
- }
123
- reqMap["email"] = email
124
- reqMap["pass"] = password
125
- logger.Info("req map:", reqMap)
126
- err = collector.Post(loginURL, reqMap)
127
- if err != nil {
128
- logger.Error("post err:", err)
120
+ if firstLogin {
121
+ firstLogin = false
122
+ logger.Info("OnHTML login_form")
123
+ loginURL, err, reqMap := getForm(element, err)
124
+ if err != nil {
125
+ logger.Error(err)
126
+ return
127
+ }
128
+ reqMap["email"] = email
129
+ reqMap["pass"] = password
130
+ logger.Info("req map:", reqMap)
131
+ err = collector.Post(loginURL, reqMap)
132
+ if err != nil {
133
+ logger.Error("post err:", err)
134
+ }
129
135
  }
130
136
  })
131
137
 
@@ -152,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
152
158
  //logger.Info("Please input OTP")
153
159
  //reader := bufio.NewReader(os.Stdin)
154
160
  //code, _ := reader.ReadString('\n')
155
- code := otp[0:6]
156
- reqMap["approvals_code"] = code
157
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
158
167
  } else {
159
168
  logger.Info("OnHTML Only Continue checkpoint")
160
169
 
@@ -188,68 +197,152 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
188
197
 
189
198
  }
190
199
 
191
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
192
201
  collector := f.collector.Clone()
193
202
  err := setupSharedCollector(collector)
194
- currentPage := 1
195
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
196
204
 
197
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
198
- currentPage++
199
- if currentPage < 3 {
200
- logger.Info("Will fetch page", currentPage)
201
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
207
+ })
208
+ collector.OnHTML("#m_group_stories_container div[role=\"article\"]", func(element *colly.HTMLElement) {
209
+ dataElement := element
210
+ post := &pb.FacebookPost{}
211
+ var fbDataFt FbDataFt
212
+ jsonData := dataElement.Attr("data-ft")
213
+
214
+ logger.Info(jsonData)
215
+ err = json.Unmarshal([]byte(jsonData), &fbDataFt)
216
+ if err != nil {
217
+ logger.Error(err)
218
+ return
219
+ }
220
+ logger.Info("Post ", fbDataFt)
221
+ post.Id = fbDataFt.TopLevelPostId
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ userId, _ := fbDataFt.ContentOwnerIdNew.Int64()
224
+ post.User = &pb.FacebookUser{
225
+ Id: userId,
226
+ Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
202
227
  }
228
+ post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
229
+ //Content
230
+
231
+ //NO BACKGROUND TEXT ONLY
232
+ post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
233
+ return selection.Text()
234
+ }), "\n")
235
+
236
+ if len(post.Content) == 0 {
237
+ // TEXT WITH BACKGROUND
238
+ post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
239
+ }
240
+
241
+ post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
242
+ post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
243
+ post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
244
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
245
+ i, _ := strconv.ParseInt(id, 10, 64)
246
+ return &pb.FacebookImage{
247
+ Id: i,
248
+ }
249
+ })).([]*pb.FacebookImage)
250
+
251
+ if fbDataFt.PhotoId > 0 {
252
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
253
+ }
254
+ result.Posts = append(result.Posts, post)
203
255
  })
256
+ if len(nextCursor) > 0 {
257
+ err = collector.Visit(nextCursor)
258
+ } else {
259
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
260
+ }
204
261
 
205
- collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
- logger.Info("Post found at", element.Attr("href"))
207
- u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
208
- postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
209
- result = append(result, &fbcrawl.FacebookPost{
210
- Id: postId,
211
- Group: &fbcrawl.FacebookGroup{Id: groupId},
212
- })
213
- //f.detailCollector.Visit(url)
262
+ if err != nil {
263
+ logger.Error("crawl by colly err:", err)
264
+ }
265
+ return err, &result
266
+ }
267
+
268
+ func (f *Fbcolly) FetchUserInfo(userIdOrUsername string) (error, *pb.FacebookUser) {
269
+ collector := f.collector.Clone()
270
+ err := setupSharedCollector(collector)
271
+
272
+ result := &pb.FacebookUser{}
273
+
274
+ collector.OnHTML("a[href*=\"lst=\"]", func(element *colly.HTMLElement) {
275
+ parsed, _ := url.Parse(element.Attr("href"))
276
+ result.Username = strings.Split(parsed.Path[1:], "/")[0]
277
+ result.Id = getUserIdFromCommentHref(parsed.Query().Get("lst"))
278
+ })
279
+
280
+ collector.OnHTML("a[href*=\"/friends\"]", func(element *colly.HTMLElement) {
281
+ result.FriendCount = getNumberFromText(element.Text)
282
+ })
283
+
284
+ collector.OnHTML("#objects_container", func(element *colly.HTMLElement) {
285
+ result.Name = element.DOM.Find("strong").First().Text()
286
+ })
287
+
288
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/%s", userIdOrUsername))
289
+ if err != nil {
290
+ logger.Error("crawl by colly err:", err)
291
+ }
292
+ return err, result
293
+ }
294
+
295
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
296
+ collector := f.collector.Clone()
297
+ err := setupSharedCollector(collector)
298
+ result := &pb.FacebookGroup{}
299
+
300
+ collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
301
+ result.Name = element.Text
302
+ })
303
+ collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
304
+ result.Id = getNumberFromText(element.Attr("href"))
305
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
214
306
  })
215
307
 
216
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
308
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
217
309
  if err != nil {
218
310
  logger.Error("crawl by colly err:", err)
219
311
  }
220
- return err, &fbcrawl.FacebookPostList{Posts: result}
312
+ return err, result
221
313
  }
222
314
 
223
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
315
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
224
316
  collector := f.collector.Clone()
225
317
  err := setupSharedCollector(collector)
226
- currentPage := 1
227
- var result []*fbcrawl.FacebookImage
318
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
228
319
 
229
320
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
230
- currentPage++
231
- logger.Info("Will fetch page", currentPage)
232
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
321
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
233
322
  })
234
323
 
235
324
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
236
- result = append(result, &fbcrawl.FacebookImage{
325
+ result.Images = append(result.Images, &pb.FacebookImage{
237
326
  Id: getImageIdFromHref(element.Attr("href")),
238
327
  })
239
328
  //f.detailCollector.Visit(url)
240
329
  })
330
+ if len(nextCursor) > 0 {
331
+ err = collector.Visit(nextCursor)
332
+ } else {
333
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
334
+ }
241
335
 
242
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
243
336
  if err != nil {
244
337
  logger.Error("crawl by colly err:", err)
245
338
  }
246
- return err, &fbcrawl.FacebookImageList{Images: result}
339
+ return err, &result
247
340
  }
248
341
 
249
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
342
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
250
343
  collector := f.collector.Clone()
251
344
  err := setupSharedCollector(collector)
252
- result := fbcrawl.FacebookImage{Id: imageId}
345
+ result := pb.FacebookImage{Id: imageId}
253
346
 
254
347
  collector.OnHTML("a", func(element *colly.HTMLElement) {
255
348
  result.Url = element.Attr("href")
@@ -262,11 +355,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
262
355
  return err, &result
263
356
  }
264
357
 
265
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
358
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
266
359
  collector := f.collector.Clone()
267
360
  err := setupSharedCollector(collector)
268
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
269
- commentPaging := 0
361
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
270
362
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
271
363
  dataElement := element.DOM.Find("div[data-ft]")
272
364
  if dataElement.Length() > 0 {
@@ -281,9 +373,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
281
373
  }
282
374
  logger.Info("Post ", result)
283
375
  post.Id = result.TopLevelPostId
284
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
285
- post.User = &fbcrawl.FacebookUser{
286
- Id: result.ContentOwnerIdNew,
376
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
377
+ userId, _ := result.ContentOwnerIdNew.Int64()
378
+ post.User = &pb.FacebookUser{
379
+ Id: userId,
287
380
  Name: dataElement.Find("h3 strong:first-child a").Text(),
288
381
  }
289
382
  post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
@@ -300,21 +393,17 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
300
393
  }
301
394
 
302
395
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
- post.ReactionCount = getReactionFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
304
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
396
+ post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
397
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
305
398
  i, _ := strconv.ParseInt(id, 10, 64)
306
- return &fbcrawl.FacebookImage{
399
+ return &pb.FacebookImage{
307
400
  Id: i,
308
401
  }
309
- })).([]*fbcrawl.FacebookImage)
402
+ })).([]*pb.FacebookImage)
310
403
 
311
404
  if result.PhotoId > 0 {
312
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
405
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
313
406
  }
314
-
315
- logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
316
- return selection.Text()
317
- }), "\n"))
318
407
  }
319
408
 
320
409
  //Comment
@@ -323,12 +412,13 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
323
412
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
324
413
  logger.Info("comment", commentId)
325
414
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
326
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
415
+ parsed, _ := url.Parse(selection.Find("h3 > a").AttrOr("href", ""))
416
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
327
417
  Id: commentId,
328
- Post: &fbcrawl.FacebookPost{Id: post.Id},
329
- User: &fbcrawl.FacebookUser{
330
- Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
331
- Name: selection.Find("h3 > a").Text(),
418
+ Post: &pb.FacebookPost{Id: post.Id},
419
+ User: &pb.FacebookUser{
420
+ Username: parsed.Path[1:],
421
+ Name: selection.Find("h3 > a").Text(),
332
422
  },
333
423
  Content: selection.Find("h3 + div").Text(),
334
424
  CreatedAt: createdAtWhenResult.Time.Unix(),
@@ -339,14 +429,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
339
429
  })
340
430
 
341
431
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
342
- if commentPaging < 3 {
343
- logger.Info("Comment paging", commentPaging)
344
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
345
- commentPaging = commentPaging + 1
346
- }
432
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
347
433
  })
434
+ if len(commentNextCursor) > 0 {
435
+ err = collector.Visit(commentNextCursor)
436
+ } else {
437
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
438
+ }
348
439
 
349
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
350
440
  return err, post
351
441
  }
352
442
 
@@ -360,8 +450,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
360
450
  //}
361
451
 
362
452
  func getUserIdFromCommentHref(href string) int64 {
363
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
364
- return id
453
+ match := regexp.MustCompile("\\d+:(\\d+)\\d+").FindStringSubmatch(href)
454
+ if len(match) > 0 {
455
+ id, _ := strconv.ParseInt(match[1], 10, 64)
456
+ return id
457
+ }
458
+ return 0
365
459
  }
366
460
 
367
461
  func getUrlFromRedirectHref(href string) string {
@@ -375,11 +469,20 @@ func getImageIdFromHref(href string) int64 {
375
469
  return i
376
470
  }
377
471
 
378
- func getReactionFromText(text string) int64 {
472
+ func getNumberFromText(text string) int64 {
379
473
  logger.Error("reaction", text)
380
474
  if len(text) > 0 {
381
- id, _ := strconv.ParseInt(regexp.MustCompile("(\\d+)").FindStringSubmatch(text)[1], 10, 64)
382
- return id
475
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
476
+ if len(match) > 0 {
477
+ count, _ := strconv.ParseInt(match[1], 10, 64)
478
+ switch match[2] {
479
+ case "k":
480
+ count *= 1000
481
+ case "m":
482
+ count *= 1000000
483
+ }
484
+ return count
485
+ }
383
486
  }
384
487
  return 0
385
488
  }