fbcrawl-colly 0.2.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5a3dd903339ecec17525ca24b86324664458ef884f70065f611dc5bc15caf0a6
4
- data.tar.gz: 7d7fbfd082240d21fc66f100eb530f81579b62c715181a10b95b1c70f5ad1d98
3
+ metadata.gz: 00d52fcc50bd651adaa80cf6102d7cc3997a07605ebb91fd33006b6e535ed6fc
4
+ data.tar.gz: 24388fa7c98d6dfbe58ba195c330ea3c363f6ef8e586873dd9e485ad29907a6c
5
5
  SHA512:
6
- metadata.gz: c5a3b0dcd89eb3b94adcc797785984e04a122a51f9135a37aed10aee0607dd357fe99ebd69e8f2eb0ebeaf05ff4c1f0bf21c4713d173065f4782dbeb192c365e
7
- data.tar.gz: d01e881175776aa5039859b1b11d723c3a577a4a68f54b6bb550d0c8c90c87d89004a94b05f21846763d56687d65bc9132e602f61a94fcebb25e934dca93ab40
6
+ metadata.gz: 15622279b09030f929218482add87878b58fdcb1e89e1ab8a1124531435707dbe26367fee75204a55fc88f8486c47cbb6ddfdb86035aa24694b1b7d6639aaecc
7
+ data.tar.gz: df50b30ad0234c824505cca54fdf33d0be54c4725c684f0317b1eba81f89d22d7b434bec4e9373b3d43dd830dc17693ac2fb785e75cefc71c6cbeefc44da2616
data/.gitignore CHANGED
@@ -12,8 +12,6 @@
12
12
  /parse.log
13
13
  last.html
14
14
  *.db
15
- /fbcrawl/fbcrawl.pb.go
16
- /lib/fbcrawl_pb.rb
17
15
 
18
16
  mkmf.log
19
17
  .rakeTasks
@@ -0,0 +1,14 @@
1
+ FROM golang:1.14-alpine
2
+ RUN apk add --no-cache git build-base tzdata
3
+
4
+ RUN mkdir -p /app
5
+ WORKDIR /app
6
+ ADD ./go.mod /app
7
+ ADD ./go.sum /app
8
+ ADD ./ /app
9
+ RUN go get
10
+
11
+
12
+ ENV PORT 3000
13
+ RUN go build -o server qnetwork.net/fbcrawl
14
+ ENTRYPOINT ["./server"]
@@ -1,19 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.1)
5
- ffi
4
+ fbcrawl-colly (1.0.0)
6
5
  google-protobuf
6
+ grpc
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- ffi (1.13.1)
12
- google-protobuf (3.12.4)
11
+ google-protobuf (3.13.0-universal-darwin)
12
+ googleapis-common-protos-types (1.0.5)
13
+ google-protobuf (~> 3.11)
14
+ grpc (1.30.2-universal-darwin)
15
+ google-protobuf (~> 3.12)
16
+ googleapis-common-protos-types (~> 1.0)
13
17
  minitest (5.14.1)
14
18
  rake (12.3.3)
15
- rake-compiler (1.1.1)
16
- rake
17
19
 
18
20
  PLATFORMS
19
21
  ruby
@@ -22,7 +24,6 @@ DEPENDENCIES
22
24
  fbcrawl-colly!
23
25
  minitest (~> 5.0)
24
26
  rake (~> 12.0)
25
- rake-compiler
26
27
 
27
28
  BUNDLED WITH
28
29
  2.1.4
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This project is to crawl mbasic.facebook.com using GO Colly.
4
4
 
5
- Ruby gem will available to use
5
+ The Ruby gem is only a client for the gRPC Colly service.
6
6
 
7
7
  ## Installation
8
8
 
@@ -25,12 +25,8 @@ Gem::Specification.new do |spec|
25
25
  end
26
26
  # spec.bindir = "exe"
27
27
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
- spec.extensions = [
29
- 'ext/fbcrawl_colly/extconf.rb'
30
- ]
31
- spec.require_paths = ["lib"]
28
+ spec.require_paths = %w[lib lib/pb]
32
29
 
33
- spec.add_runtime_dependency 'ffi'
34
30
  spec.add_runtime_dependency 'google-protobuf'
35
- spec.add_development_dependency 'rake-compiler'
31
+ spec.add_runtime_dependency 'grpc'
36
32
  end
@@ -1,12 +1,79 @@
1
1
  syntax = "proto3";
2
2
 
3
3
  package fbcrawl_colly;
4
- option go_package = "./fbcrawl;fbcrawl";
4
+ option go_package = "./fbcrawl/pb;pb";
5
+
6
+ service Grpc {
7
+ // Init takes no arguments and returns a Pointer, which the other requests carry in their `pointer` field.
8
+ rpc Init (Empty) returns (Pointer) {}
9
+ rpc FreeColly (Pointer) returns (Empty) {}
10
+ rpc Login (LoginRequest) returns (LoginResponse) {}
11
+ rpc LoginWithCookies (LoginWithCookiesRequest) returns (Empty) {}
12
+ rpc FetchGroupInfo (FetchGroupInfoRequest) returns (FacebookGroup) {}
13
+ rpc FetchGroupFeed (FetchGroupFeedRequest) returns (FacebookPostList) {}
14
+ rpc FetchPost (FetchPostRequest) returns (FacebookPost) {}
15
+ rpc FetchContentImages (FetchContentImagesRequest) returns (FacebookImageList) {}
16
+ rpc FetchImageUrl (FetchImageUrlRequest) returns (FacebookImage) {}
17
+ }
18
+
19
+ message Empty {
20
+
21
+ }
22
+
23
+ message Pointer {
24
+ int64 address = 1;
25
+ }
26
+
27
+ message LoginRequest {
28
+ Pointer pointer = 1;
29
+ string email = 2;
30
+ string password = 3;
31
+ string totp_secret = 4;
32
+ }
33
+
34
+ message LoginResponse {
35
+ string cookies = 1;
36
+ }
37
+
38
+ message LoginWithCookiesRequest {
39
+ Pointer pointer = 1;
40
+ string cookies = 2;
41
+ }
42
+
43
+ message FetchGroupInfoRequest {
44
+ Pointer pointer = 1;
45
+ string group_username = 2;
46
+ }
47
+
48
+ message FetchGroupFeedRequest {
49
+ Pointer pointer = 1;
50
+ int64 group_id = 2;
51
+ string next_cursor = 3;
52
+ }
53
+
54
+ message FetchPostRequest {
55
+ Pointer pointer = 1;
56
+ int64 group_id = 2;
57
+ int64 post_id = 3;
58
+ string comment_next_cursor = 4;
59
+ }
60
+
61
+ message FetchContentImagesRequest {
62
+ Pointer pointer = 1;
63
+ int64 post_id = 2;
64
+ string next_cursor = 3;
65
+ }
66
+
67
+ message FetchImageUrlRequest {
68
+ Pointer pointer = 1;
69
+ int64 image_id = 2;
70
+ }
5
71
 
6
72
  // A Facebook group (returned by FetchGroupInfo and embedded in FacebookPost).
7
73
  message FacebookGroup {
8
74
  int64 id = 1;
9
75
  string name = 2;
76
+ int64 member_count = 3;
10
77
  }
11
78
 
12
79
  message FacebookUser {
@@ -19,12 +86,18 @@ message FacebookPost {
19
86
  FacebookGroup group = 2;
20
87
  FacebookUser user = 3;
21
88
  string content = 4;
89
+ CommentList comments = 5;
22
90
  string content_link = 6;
23
- FacebookImage content_image = 8;
24
91
  repeated FacebookImage content_images = 7;
25
- repeated FacebookComment comments = 5;
92
+ FacebookImage content_image = 8;
26
93
  int64 created_at = 9;
27
94
  int64 reaction_count = 10;
95
+ int64 comment_count = 11;
96
+ }
97
+
98
+ message CommentList {
99
+ repeated FacebookComment comments = 5;
100
+ string next_cursor = 12;
28
101
  }
29
102
 
30
103
  message FacebookImage {
@@ -42,8 +115,10 @@ message FacebookComment {
42
115
 
43
116
  message FacebookPostList {
44
117
  repeated FacebookPost posts = 1;
118
+ string next_cursor = 2;
45
119
  }
46
120
 
47
121
  message FacebookImageList {
48
122
  repeated FacebookImage images = 1;
123
+ string next_cursor = 2;
49
124
  }
@@ -6,16 +6,18 @@ import (
6
6
  "errors"
7
7
  "fmt"
8
8
  "github.com/PuerkitoBio/goquery"
9
- "github.com/gocolly/colly"
10
- "github.com/gocolly/colly/extensions"
11
- "github.com/gocolly/colly/storage"
9
+ "github.com/gocolly/colly/v2"
10
+ "github.com/gocolly/colly/v2/debug"
11
+ "github.com/gocolly/colly/v2/extensions"
12
+ "github.com/gocolly/colly/v2/storage"
12
13
  "github.com/google/logger"
13
14
  "github.com/olebedev/when"
14
15
  "github.com/olebedev/when/rules/common"
15
16
  "github.com/olebedev/when/rules/en"
16
17
  "github.com/thoas/go-funk"
18
+ "github.com/xlzd/gotp"
17
19
  "net/url"
18
- "qnetwork.net/fbcrawl/fbcrawl"
20
+ "qnetwork.net/fbcrawl/fbcrawl/pb"
19
21
  "regexp"
20
22
  "strconv"
21
23
  "strings"
@@ -42,7 +44,7 @@ type FbDataFt struct {
42
44
  }
43
45
 
44
46
  func sharedOnRequest(request *colly.Request) {
45
- logger.Info("OnRequest")
47
+ logger.Info("OnRequest ", request.URL)
46
48
  //request.Headers.Set("Host", "facebook.com")
47
49
  request.Headers.Set("Accept-Language", "en-US,en;q=0.9")
48
50
  request.Headers.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
@@ -62,9 +64,10 @@ func sharedOnRequest(request *colly.Request) {
62
64
  func setupSharedCollector(collector *colly.Collector) error {
63
65
  var err error
64
66
  extensions.Referer(collector)
65
-
67
+ collector.AllowURLRevisit = true
66
68
  collector.OnRequest(sharedOnRequest)
67
69
  collector.OnResponse(sharedOnResponse)
70
+ collector.SetDebugger(&debug.LogDebugger{})
68
71
  collector.OnError(func(resp *colly.Response, errHttp error) {
69
72
  err = errHttp
70
73
  logger.Error("OnError", err)
@@ -106,26 +109,29 @@ func New() *Fbcolly {
106
109
  return &f
107
110
  }
108
111
 
109
- func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
112
+ func (f *Fbcolly) Login(email string, password string, totpSecret string) (string, error) {
110
113
  collector := f.collector.Clone()
111
114
  err := setupSharedCollector(collector)
112
115
 
113
116
  logger.Info("Login using email", email)
114
117
  loggedIn := false
115
-
118
+ firstLogin := true
116
119
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
117
- logger.Info("OnHTML login_form")
118
- loginURL, err, reqMap := getForm(element, err)
119
- if err != nil {
120
- logger.Error(err)
121
- return
122
- }
123
- reqMap["email"] = email
124
- reqMap["pass"] = password
125
- logger.Info("req map:", reqMap)
126
- err = collector.Post(loginURL, reqMap)
127
- if err != nil {
128
- logger.Error("post err:", err)
120
+ if firstLogin {
121
+ firstLogin = false
122
+ logger.Info("OnHTML login_form")
123
+ loginURL, err, reqMap := getForm(element, err)
124
+ if err != nil {
125
+ logger.Error(err)
126
+ return
127
+ }
128
+ reqMap["email"] = email
129
+ reqMap["pass"] = password
130
+ logger.Info("req map:", reqMap)
131
+ err = collector.Post(loginURL, reqMap)
132
+ if err != nil {
133
+ logger.Error("post err:", err)
134
+ }
129
135
  }
130
136
  })
131
137
 
@@ -152,9 +158,12 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
152
158
  //logger.Info("Please input OTP")
153
159
  //reader := bufio.NewReader(os.Stdin)
154
160
  //code, _ := reader.ReadString('\n')
155
- code := otp[0:6]
156
- reqMap["approvals_code"] = code
157
- shouldSubmit = true
161
+ if len(totpSecret) > 0 {
162
+ code := gotp.NewDefaultTOTP(totpSecret).Now()
163
+ reqMap["approvals_code"] = code
164
+ shouldSubmit = true
165
+ }
166
+
158
167
  } else {
159
168
  logger.Info("OnHTML Only Continue checkpoint")
160
169
 
@@ -188,68 +197,124 @@ func (f *Fbcolly) Login(email string, password string, otp string) (string, erro
188
197
 
189
198
  }
190
199
 
191
- func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
200
+ func (f *Fbcolly) FetchGroupFeed(groupId int64, nextCursor string) (error, *pb.FacebookPostList) {
192
201
  collector := f.collector.Clone()
193
202
  err := setupSharedCollector(collector)
194
- currentPage := 1
195
- var result []*fbcrawl.FacebookPost
203
+ result := pb.FacebookPostList{Posts: []*pb.FacebookPost{}}
196
204
 
197
205
  collector.OnHTML("#m_group_stories_container > :last-child a", func(element *colly.HTMLElement) {
198
- currentPage++
199
- if currentPage < 3 {
200
- logger.Info("Will fetch page", currentPage)
201
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
206
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
207
+ })
208
+ collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
209
+ dataElement := element
210
+ post := &pb.FacebookPost{}
211
+ var fbDataFt FbDataFt
212
+ jsonData := dataElement.Attr("data-ft")
213
+
214
+ logger.Info(jsonData)
215
+ err = json.Unmarshal([]byte(jsonData), &fbDataFt)
216
+ if err != nil {
217
+ logger.Error(err)
218
+ return
219
+ }
220
+ logger.Info("Post ", fbDataFt)
221
+ post.Id = fbDataFt.TopLevelPostId
222
+ post.Group = &pb.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
223
+ post.User = &pb.FacebookUser{
224
+ Id: fbDataFt.ContentOwnerIdNew,
225
+ Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
226
+ }
227
+ post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
228
+ //Content
229
+
230
+ // Plain-text post (no background image): join the text of all <p> elements
231
+ post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
232
+ return selection.Text()
233
+ }), "\n")
234
+
235
+ if len(post.Content) == 0 {
236
+ // TEXT WITH BACKGROUND
237
+ post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
202
238
  }
239
+
240
+ post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
241
+ post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
242
+ post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
243
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
244
+ i, _ := strconv.ParseInt(id, 10, 64)
245
+ return &pb.FacebookImage{
246
+ Id: i,
247
+ }
248
+ })).([]*pb.FacebookImage)
249
+
250
+ if fbDataFt.PhotoId > 0 {
251
+ post.ContentImage = &pb.FacebookImage{Id: fbDataFt.PhotoId}
252
+ }
253
+ result.Posts = append(result.Posts, post)
203
254
  })
255
+ if len(nextCursor) > 0 {
256
+ err = collector.Visit(nextCursor)
257
+ } else {
258
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
259
+ }
204
260
 
205
- collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
- logger.Info("Post found at", element.Attr("href"))
207
- u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
208
- postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
209
- result = append(result, &fbcrawl.FacebookPost{
210
- Id: postId,
211
- Group: &fbcrawl.FacebookGroup{Id: groupId},
212
- })
213
- //f.detailCollector.Visit(url)
261
+ if err != nil {
262
+ logger.Error("crawl by colly err:", err)
263
+ }
264
+ return err, &result
265
+ }
266
+
267
+ func (f *Fbcolly) FetchGroupInfo(groupIdOrUsername string) (error, *pb.FacebookGroup) {
268
+ collector := f.collector.Clone()
269
+ err := setupSharedCollector(collector)
270
+ result := &pb.FacebookGroup{}
271
+
272
+ collector.OnHTML("a[href=\"#groupMenuBottom\"] h1", func(element *colly.HTMLElement) {
273
+ result.Name = element.Text
274
+ })
275
+ collector.OnHTML("a[href*=\"view=member\"]", func(element *colly.HTMLElement) {
276
+ result.Id = getNumberFromText(element.Attr("href"))
277
+ result.MemberCount, _ = strconv.ParseInt(element.DOM.Closest("tr").Find("td:last-child").Text(), 10, 64)
214
278
  })
215
279
 
216
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
280
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%s?view=info", groupIdOrUsername))
217
281
  if err != nil {
218
282
  logger.Error("crawl by colly err:", err)
219
283
  }
220
- return err, &fbcrawl.FacebookPostList{Posts: result}
284
+ return err, result
221
285
  }
222
286
 
223
- func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
287
+ func (f *Fbcolly) FetchContentImages(postId int64, nextCursor string) (error, *pb.FacebookImageList) {
224
288
  collector := f.collector.Clone()
225
289
  err := setupSharedCollector(collector)
226
- currentPage := 1
227
- var result []*fbcrawl.FacebookImage
290
+ result := pb.FacebookImageList{Images: []*pb.FacebookImage{}}
228
291
 
229
292
  collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
230
- currentPage++
231
- logger.Info("Will fetch page", currentPage)
232
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
293
+ result.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
233
294
  })
234
295
 
235
296
  collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
236
- result = append(result, &fbcrawl.FacebookImage{
297
+ result.Images = append(result.Images, &pb.FacebookImage{
237
298
  Id: getImageIdFromHref(element.Attr("href")),
238
299
  })
239
300
  //f.detailCollector.Visit(url)
240
301
  })
302
+ if len(nextCursor) > 0 {
303
+ err = collector.Visit(nextCursor)
304
+ } else {
305
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
306
+ }
241
307
 
242
- err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
243
308
  if err != nil {
244
309
  logger.Error("crawl by colly err:", err)
245
310
  }
246
- return err, &fbcrawl.FacebookImageList{Images: result}
311
+ return err, &result
247
312
  }
248
313
 
249
- func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
314
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *pb.FacebookImage) {
250
315
  collector := f.collector.Clone()
251
316
  err := setupSharedCollector(collector)
252
- result := fbcrawl.FacebookImage{Id: imageId}
317
+ result := pb.FacebookImage{Id: imageId}
253
318
 
254
319
  collector.OnHTML("a", func(element *colly.HTMLElement) {
255
320
  result.Url = element.Attr("href")
@@ -262,11 +327,10 @@ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
262
327
  return err, &result
263
328
  }
264
329
 
265
- func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
330
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64, commentNextCursor string) (error, *pb.FacebookPost) {
266
331
  collector := f.collector.Clone()
267
332
  err := setupSharedCollector(collector)
268
- post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
269
- commentPaging := 0
333
+ post := &pb.FacebookPost{Comments: &pb.CommentList{Comments: []*pb.FacebookComment{}}}
270
334
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
271
335
  dataElement := element.DOM.Find("div[data-ft]")
272
336
  if dataElement.Length() > 0 {
@@ -281,8 +345,8 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
281
345
  }
282
346
  logger.Info("Post ", result)
283
347
  post.Id = result.TopLevelPostId
284
- post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
285
- post.User = &fbcrawl.FacebookUser{
348
+ post.Group = &pb.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
349
+ post.User = &pb.FacebookUser{
286
350
  Id: result.ContentOwnerIdNew,
287
351
  Name: dataElement.Find("h3 strong:first-child a").Text(),
288
352
  }
@@ -300,21 +364,17 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
300
364
  }
301
365
 
302
366
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
- post.ReactionCount = getReactionFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
304
- post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
367
+ post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
368
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *pb.FacebookImage {
305
369
  i, _ := strconv.ParseInt(id, 10, 64)
306
- return &fbcrawl.FacebookImage{
370
+ return &pb.FacebookImage{
307
371
  Id: i,
308
372
  }
309
- })).([]*fbcrawl.FacebookImage)
373
+ })).([]*pb.FacebookImage)
310
374
 
311
375
  if result.PhotoId > 0 {
312
- post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
376
+ post.ContentImage = &pb.FacebookImage{Id: result.PhotoId}
313
377
  }
314
-
315
- logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
316
- return selection.Text()
317
- }), "\n"))
318
378
  }
319
379
 
320
380
  //Comment
@@ -323,10 +383,10 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
323
383
  commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
324
384
  logger.Info("comment", commentId)
325
385
  createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
326
- post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
386
+ post.Comments.Comments = append(post.Comments.Comments, &pb.FacebookComment{
327
387
  Id: commentId,
328
- Post: &fbcrawl.FacebookPost{Id: post.Id},
329
- User: &fbcrawl.FacebookUser{
388
+ Post: &pb.FacebookPost{Id: post.Id},
389
+ User: &pb.FacebookUser{
330
390
  Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
331
391
  Name: selection.Find("h3 > a").Text(),
332
392
  },
@@ -339,14 +399,14 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
339
399
  })
340
400
 
341
401
  collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
342
- if commentPaging < 3 {
343
- logger.Info("Comment paging", commentPaging)
344
- err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
345
- commentPaging = commentPaging + 1
346
- }
402
+ post.Comments.NextCursor = "http://mbasic.facebook.com" + element.Attr("href")
347
403
  })
404
+ if len(commentNextCursor) > 0 {
405
+ err = collector.Visit(commentNextCursor)
406
+ } else {
407
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
408
+ }
348
409
 
349
- err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
350
410
  return err, post
351
411
  }
352
412
 
@@ -360,8 +420,12 @@ func (f *Fbcolly) LoginWithCookies(cookies string) error {
360
420
  //}
361
421
 
362
422
  func getUserIdFromCommentHref(href string) int64 {
363
- id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
364
- return id
423
+ match := regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)
424
+ if len(match) > 0 {
425
+ id, _ := strconv.ParseInt(match[1], 10, 64)
426
+ return id
427
+ }
428
+ return 0
365
429
  }
366
430
 
367
431
  func getUrlFromRedirectHref(href string) string {
@@ -375,11 +439,20 @@ func getImageIdFromHref(href string) int64 {
375
439
  return i
376
440
  }
377
441
 
378
- func getReactionFromText(text string) int64 {
442
+ func getNumberFromText(text string) int64 {
379
443
  logger.Error("reaction", text)
380
444
  if len(text) > 0 {
381
- id, _ := strconv.ParseInt(regexp.MustCompile("(\\d+)").FindStringSubmatch(text)[1], 10, 64)
382
- return id
445
+ match := regexp.MustCompile("(\\d+)\\s?([km]?)").FindStringSubmatch(text)
446
+ if len(match) > 0 {
447
+ count, _ := strconv.ParseInt(match[1], 10, 64)
448
+ switch match[2] {
449
+ case "k":
450
+ count *= 1000
451
+ case "m":
452
+ count *= 1000000
453
+ }
454
+ return count
455
+ }
383
456
  }
384
457
  return 0
385
458
  }