fbcrawl-colly 0.2.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5a3dd903339ecec17525ca24b86324664458ef884f70065f611dc5bc15caf0a6
4
- data.tar.gz: 7d7fbfd082240d21fc66f100eb530f81579b62c715181a10b95b1c70f5ad1d98
3
+ metadata.gz: 00d52fcc50bd651adaa80cf6102d7cc3997a07605ebb91fd33006b6e535ed6fc
4
+ data.tar.gz: 24388fa7c98d6dfbe58ba195c330ea3c363f6ef8e586873dd9e485ad29907a6c
5
5
  SHA512:
6
- metadata.gz: c5a3b0dcd89eb3b94adcc797785984e04a122a51f9135a37aed10aee0607dd357fe99ebd69e8f2eb0ebeaf05ff4c1f0bf21c4713d173065f4782dbeb192c365e
7
- data.tar.gz: d01e881175776aa5039859b1b11d723c3a577a4a68f54b6bb550d0c8c90c87d89004a94b05f21846763d56687d65bc9132e602f61a94fcebb25e934dca93ab40
6
+ metadata.gz: 15622279b09030f929218482add87878b58fdcb1e89e1ab8a1124531435707dbe26367fee75204a55fc88f8486c47cbb6ddfdb86035aa24694b1b7d6639aaecc
7
+ data.tar.gz: df50b30ad0234c824505cca54fdf33d0be54c4725c684f0317b1eba81f89d22d7b434bec4e9373b3d43dd830dc17693ac2fb785e75cefc71c6cbeefc44da2616
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.1)
5
- ffi
4
+ fbcrawl-colly (1.0.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.12.4)
11
+ google-protobuf (3.13.0-universal-darwin)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.30.2-universal-darwin)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project is to crawl mbasic.facebook.com using GO Colly.
4
4
 
5
- Ruby gem will available to use
5
+ The Ruby gem is only a client for the gRPC Colly service.
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,12 +1,79 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Session setup (Init, FreeColly, Login, LoginWithCookies) and the Fetch* crawl calls.
8
+ rpc Init (Empty) returns (Pointer) {}
9
+ rpc FreeColly (Pointer) returns (Empty) {}
10
+ rpc Login (LoginRequest) returns (LoginResponse) {}
11
+ rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
12
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
13
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
14
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
15
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
16
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
17
+ }
18
+
19
+ message Empty {
20
+
21
+ }
22
+
23
+ message Pointer {
24
+ int64 address = 1;
25
+ }
26
+
27
+ message LoginRequest {
28
+ Pointer pointer = 1;
29
+ string email = 2;
30
+ string password = 3;
31
+ string totp_secret = 4;
32
+ }
33
+
34
+ message LoginResponse {
35
+ string cookies = 1;
36
+ }
37
+
38
+ message LoginWithCookiesRequest {
39
+ Pointer pointer = 1;
40
+ string cookies = 2;
41
+ }
42
+
43
+ message FetchGroupInfoRequest {
44
+ Pointer pointer = 1;
45
+ string group_username = 2;
46
+ }
47
+
48
+ message FetchGroupFeedRequest {
49
+ Pointer pointer = 1;
50
+ int64 group_id = 2;
51
+ string next_cursor = 3;
52
+ }
53
+
54
+ message FetchPostRequest {
55
+ Pointer pointer = 1;
56
+ int64 group_id = 2;
57
+ int64 post_id = 3;
58
+ string comment_next_cursor = 4;
59
+ }
60
+
61
+ message FetchContentImagesRequest {
62
+ Pointer pointer = 1;
63
+ int64 post_id = 2;
64
+ string next_cursor = 3;
65
+ }
66
+
67
+ message FetchImageUrlRequest {
68
+ Pointer pointer = 1;
69
+ int64 image_id = 2;
70
+ }
5
71
 
6
72
  // A Facebook group: numeric id, display name and member count.
7
73
  message FacebookGroup {
8
74
  int64 id = 1;
9
75
  string name = 2;
76
+ int64 member_count = 3;
10
77
  }
11
78
 
12
79
  message FacebookUser {
@@ -19,12 +86,18 @@ message FacebookPost {
19
86
  FacebookGroup group = 2;
20
87
  FacebookUser user = 3;
21
88
  string content = 4;
89
+ CommentList comments = 5;
22
90
  string content_link = 6;
23
- FacebookImage content_image = 8;
24
91
  repeated FacebookImage content_images = 7;
25
- repeated FacebookComment comments = 5;
92
+ FacebookImage content_image = 8;
26
93
  int64 created_at = 9;
27
94
  int64 reaction_count = 10;
95
+ int64 comment_count = 11;
96
+ }
97
+
98
+ message CommentList {
99
+ repeated FacebookComment comments = 5;
100
+ string next_cursor = 12;
28
101
  }
29
102
 
30
103
  message FacebookImage {
@@ -42,8 +115,10 @@ message FacebookComment {
42
115
 
43
116
  message FacebookPostList {
44
117
  repeated FacebookPost posts = 1;
118
+ string next_cursor = 2;
45
119
  }
46
120
 
47
121
  message FacebookImageList {
48
122
  repeated FacebookImage images = 1;
123
+ string next_cursor = 2;
49
124
  }
@@ -6,16 +6,18 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/debug"
11
+ "github.com/gocolly/colly/v2/extensions"
12
+ "github.com/gocolly/colly/v2/storage"
12
13
  "github.com/google/logger"
13
14
  "github.com/olebedev/when"
14
15
  "github.com/olebedev/when/rules/common"
15
16
  "github.com/olebedev/when/rules/en"
16
17
  "github.com/thoas/go-funk"
18
+ "github.com/xlzd/gotp"
17
19
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
20
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
21
  "regexp"
20
22
  "strconv"
21
23
  "strings"
@@ -42,7 +44,7 @@ type FbDataFt struct {
42
44
  }
43
45
 
44
46
  func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
47
+ logger.Info("OnRequest ", request.URL)
46
48
  //request.Headers.Set("Host", "facebook.com")
47
49
  request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
50
  request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
@@ -62,9 +64,10 @@ func sharedOnRequest(request *colly.Request) {
62
64
  func setupSharedCollector(collector *colly.Collector) error {
63
65
  var err error
64
66
  extensions.Referer(collector)
65
-
67
+ collector.AllowURLRevisit = true
66
68
  collector.OnRequest(sharedOnRequest)
67
69
  collector.OnResponse(sharedOnResponse)
70
+ collector.SetDebugger(&debug.LogDebugger{})
68
71
  collector.OnError(func(resp *colly.Response, errHttp error) {
69
72
  err = errHttp
70
73
  logger.Error("OnError", err)
@@ -106,26 +109,29 @@ func New() *Fbcolly {
106
109
  return &f
107
110
  }
108
111
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
113
  collector := f.collector.Clone()
111
114
  err := setupSharedCollector(collector)
112
115
 
113
116
  logger.Info("Login using email", email)
114
117
  loggedIn := false
115
-
118
+ firstLogin := true
116
119
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
117
- logger.Info("OnHTML login_form")
118
- loginURL, err, reqMap := getForm(element, err)
119
- if err != nil {
120
- logger.Error(err)
121
- return
122
- }
123
- reqMap["email"] = email
124
- reqMap["pass"] = password
125
- logger.Info("req map:", reqMap)
126
- err = collector.Post(loginURL, reqMap)
127
- if err != nil {
128
- logger.Error("post err:", err)
120
+ if firstLogin {
121
+ firstLogin = false
122
+ logger.Info("OnHTML login_form")
123
+ loginURL, err, reqMap := getForm(element, err)
124
+ if err != nil {
125
+ logger.Error(err)
126
+ return
127
+ }
128
+ reqMap["email"] = email
129
+ reqMap["pass"] = password
130
+ logger.Info("req map:", reqMap)
131
+ err = collector.Post(loginURL, reqMap)
132
+ if err != nil {
133
+ logger.Error("post err:", err)
134
+ }
129
135
  }
130
136
  })
131
137
 
@@ -152,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
152
158
  //logger.Info("Please input OTP")
153
159
  //reader := bufio.NewReader(os.Stdin)
154
160
  //code, _ := reader.ReadString('\n')
155
- code := otp[0:6]
156
- reqMap["approvals_code"] = code
157
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
158
167
  } else {
159
168
  logger.Info("OnHTML Only Continue checkpoint")
160
169
 
@@ -188,68 +197,124 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
188
197
 
189
198
  }
190
199
 
191
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
192
201
  collector := f.collector.Clone()
193
202
  err := setupSharedCollector(collector)
194
- currentPage := 1
195
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
196
204
 
197
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
198
- currentPage++
199
- if currentPage < 3 {
200
- logger.Info("Will fetch page", currentPage)
201
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
207
+ })
208
+ collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
209
+ dataElement := element
210
+ post := &pb.FacebookPost{}
211
+ var fbDataFt FbDataFt
212
+ jsonData := dataElement.Attr("data-ft")
213
+
214
+ logger.Info(jsonData)
215
+ err = json.Unmarshal([]byte(jsonData), &fbDataFt)
216
+ if err != nil {
217
+ logger.Error(err)
218
+ return
219
+ }
220
+ logger.Info("Post ", fbDataFt)
221
+ post.Id = fbDataFt.TopLevelPostId
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ post.User = &pb.FacebookUser{
224
+ Id: fbDataFt.ContentOwnerIdNew,
225
+ Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
226
+ }
227
+ post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
228
+ //Content
229
+
230
+ //NO BACKGROUND TEXT ONLY
231
+ post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
232
+ return selection.Text()
233
+ }), "\n")
234
+
235
+ if len(post.Content) == 0 {
236
+ // TEXT WITH BACKGROUND
237
+ post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
202
238
  }
239
+
240
+ post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
241
+ post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
242
+ post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
243
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
244
+ i, _ := strconv.ParseInt(id, 10, 64)
245
+ return &pb.FacebookImage{
246
+ Id: i,
247
+ }
248
+ })).([]*pb.FacebookImage)
249
+
250
+ if fbDataFt.PhotoId > 0 {
251
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
252
+ }
253
+ result.Posts = append(result.Posts, post)
203
254
  })
255
+ if len(nextCursor) > 0 {
256
+ err = collector.Visit(nextCursor)
257
+ } else {
258
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
259
+ }
204
260
 
205
- collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
- logger.Info("Post found at", element.Attr("href"))
207
- u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
208
- postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
209
- result = append(result, &fbcrawl.FacebookPost{
210
- Id: postId,
211
- Group: &fbcrawl.FacebookGroup{Id: groupId},
212
- })
213
- //f.detailCollector.Visit(url)
261
+ if err != nil {
262
+ logger.Error("crawl by colly err:", err)
263
+ }
264
+ return err, &result
265
+ }
266
+
267
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
268
+ collector := f.collector.Clone()
269
+ err := setupSharedCollector(collector)
270
+ result := &pb.FacebookGroup{}
271
+
272
+ collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
273
+ result.Name = element.Text
274
+ })
275
+ collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
276
+ result.Id = getNumberFromText(element.Attr("href"))
277
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
214
278
  })
215
279
 
216
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
280
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
217
281
  if err != nil {
218
282
  logger.Error("crawl by colly err:", err)
219
283
  }
220
- return err, &fbcrawl.FacebookPostList{Posts: result}
284
+ return err, result
221
285
  }
222
286
 
223
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
287
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
224
288
  collector := f.collector.Clone()
225
289
  err := setupSharedCollector(collector)
226
- currentPage := 1
227
- var result []*fbcrawl.FacebookImage
290
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
228
291
 
229
292
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
230
- currentPage++
231
- logger.Info("Will fetch page", currentPage)
232
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
293
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
233
294
  })
234
295
 
235
296
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
236
- result = append(result, &fbcrawl.FacebookImage{
297
+ result.Images = append(result.Images, &pb.FacebookImage{
237
298
  Id: getImageIdFromHref(element.Attr("href")),
238
299
  })
239
300
  //f.detailCollector.Visit(url)
240
301
  })
302
+ if len(nextCursor) > 0 {
303
+ err = collector.Visit(nextCursor)
304
+ } else {
305
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
306
+ }
241
307
 
242
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
243
308
  if err != nil {
244
309
  logger.Error("crawl by colly err:", err)
245
310
  }
246
- return err, &fbcrawl.FacebookImageList{Images: result}
311
+ return err, &result
247
312
  }
248
313
 
249
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
314
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
250
315
  collector := f.collector.Clone()
251
316
  err := setupSharedCollector(collector)
252
- result := fbcrawl.FacebookImage{Id: imageId}
317
+ result := pb.FacebookImage{Id: imageId}
253
318
 
254
319
  collector.OnHTML("a", func(element *colly.HTMLElement) {
255
320
  result.Url = element.Attr("href")
@@ -262,11 +327,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
262
327
  return err, &result
263
328
  }
264
329
 
265
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
330
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
266
331
  collector := f.collector.Clone()
267
332
  err := setupSharedCollector(collector)
268
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
269
- commentPaging := 0
333
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
270
334
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
271
335
  dataElement := element.DOM.Find("div[data-ft]")
272
336
  if dataElement.Length() > 0 {
@@ -281,8 +345,8 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
281
345
  }
282
346
  logger.Info("Post ", result)
283
347
  post.Id = result.TopLevelPostId
284
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
285
- post.User = &fbcrawl.FacebookUser{
348
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
349
+ post.User = &pb.FacebookUser{
286
350
  Id: result.ContentOwnerIdNew,
287
351
  Name: dataElement.Find("h3 strong:first-child a").Text(),
288
352
  }
@@ -300,21 +364,17 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
300
364
  }
301
365
 
302
366
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
- post.ReactionCount = getReactionFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
304
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
367
+ post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
368
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
305
369
  i, _ := strconv.ParseInt(id, 10, 64)
306
- return &fbcrawl.FacebookImage{
370
+ return &pb.FacebookImage{
307
371
  Id: i,
308
372
  }
309
- })).([]*fbcrawl.FacebookImage)
373
+ })).([]*pb.FacebookImage)
310
374
 
311
375
  if result.PhotoId > 0 {
312
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
376
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
313
377
  }
314
-
315
- logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
316
- return selection.Text()
317
- }), "\n"))
318
378
  }
319
379
 
320
380
  //Comment
@@ -323,10 +383,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
323
383
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
324
384
  logger.Info("comment", commentId)
325
385
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
326
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
386
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
327
387
  Id: commentId,
328
- Post: &fbcrawl.FacebookPost{Id: post.Id},
329
- User: &fbcrawl.FacebookUser{
388
+ Post: &pb.FacebookPost{Id: post.Id},
389
+ User: &pb.FacebookUser{
330
390
  Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
331
391
  Name: selection.Find("h3 > a").Text(),
332
392
  },
@@ -339,14 +399,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
339
399
  })
340
400
 
341
401
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
342
- if commentPaging < 3 {
343
- logger.Info("Comment paging", commentPaging)
344
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
345
- commentPaging = commentPaging + 1
346
- }
402
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
347
403
  })
404
+ if len(commentNextCursor) > 0 {
405
+ err = collector.Visit(commentNextCursor)
406
+ } else {
407
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
408
+ }
348
409
 
349
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
350
410
  return err, post
351
411
  }
352
412
 
@@ -360,8 +420,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
360
420
  //}
361
421
 
362
422
  func getUserIdFromCommentHref(href string) int64 {
363
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
364
- return id
423
+ match := regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)
424
+ if len(match) > 0 {
425
+ id, _ := strconv.ParseInt(match[1], 10, 64)
426
+ return id
427
+ }
428
+ return 0
365
429
  }
366
430
 
367
431
  func getUrlFromRedirectHref(href string) string {
@@ -375,11 +439,20 @@ func getImageIdFromHref(href string) int64 {
375
439
  return i
376
440
  }
377
441
 
378
- func getReactionFromText(text string) int64 {
442
+ func getNumberFromText(text string) int64 {
379
443
  logger.Error("reaction", text)
380
444
  if len(text) > 0 {
381
- id, _ := strconv.ParseInt(regexp.MustCompile("(\\d+)").FindStringSubmatch(text)[1], 10, 64)
382
- return id
445
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
446
+ if len(match) > 0 {
447
+ count, _ := strconv.ParseInt(match[1], 10, 64)
448
+ switch match[2] {
449
+ case "k":
450
+ count *= 1000
451
+ case "m":
452
+ count *= 1000000
453
+ }
454
+ return count
455
+ }
383
456
  }
384
457
  return 0
385
458
  }