fbcrawl-colly 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 383de209c502265a8533af201e3b9538edffc115af1575bf9662508abb142e00
4
- data.tar.gz: 631c53a8c82e063df262ccac4f6c74f4591296b700ffa627110a9bf0dce40358
3
+ metadata.gz: 6e43a203c1c38131f0d9eb32b67a1859bd1530aac9249751ff0329a5901388f6
4
+ data.tar.gz: f3520762f00d2b1a2d475a9b97d2141fe8cd98faf718ccf3d8a797e61489f976
5
5
  SHA512:
6
- metadata.gz: b8c1505f7183c0a30a6d6679319a947be589828771e3bc231915cb841aad5e79b7221091862f6df537a701eaa2bf21bb996ba689e301d00a79964fd47e45f258
7
- data.tar.gz: 2ce2e0870f9455cdaeab48c840e1c9f5fb35afc340e0c56830afca90eb7bcebcf120430b7c3d043d58274d545ab0c7bb99d4062b36885b1e832bb540b7bf5080
6
+ metadata.gz: 2560aa6a3239671a14f82a95034bbad03684dd44b0fc32cb2092b404678b2ec0e124a33db64a92f5c22f4cd3d9d94a42ccc40a6ab2e1e362088d0f02f05d22f3
7
+ data.tar.gz: 305f3d463d75bc3edcfcd1b5a985d9a9882e990e850786132eb1b6d4e93f4e3e31a2c0e9aeb23141cc0633cd1d5ad8e1d88ec4c0e3e4d2d201a6298a7c8699c6
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- fbcrawl-colly (0.2.2)
4
+ fbcrawl-colly (0.2.3)
5
5
  ffi
6
6
  google-protobuf
7
7
 
@@ -201,16 +201,52 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
201
201
  err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
202
202
  }
203
203
  })
204
+ collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
205
+ dataElement := element
206
+ post := &fbcrawl.FacebookPost{}
207
+ var fbDataFt FbDataFt
208
+ jsonData := dataElement.Attr("data-ft")
209
+
210
+ logger.Info(jsonData)
211
+ err = json.Unmarshal([]byte(jsonData), &fbDataFt)
212
+ if err != nil {
213
+ logger.Error(err)
214
+ return
215
+ }
216
+ logger.Info("Post ", fbDataFt)
217
+ post.Id = fbDataFt.TopLevelPostId
218
+ post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
219
+ post.User = &fbcrawl.FacebookUser{
220
+ Id: fbDataFt.ContentOwnerIdNew,
221
+ Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
222
+ }
223
+ post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
224
+ //Content
204
225
 
205
- collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
- logger.Info("Post found at", element.Attr("href"))
207
- u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
208
- postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
209
- result = append(result, &fbcrawl.FacebookPost{
210
- Id: postId,
211
- Group: &fbcrawl.FacebookGroup{Id: groupId},
212
- })
213
- //f.detailCollector.Visit(url)
226
+ //NO BACKGROUND TEXT ONLY
227
+ post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
228
+ return selection.Text()
229
+ }), "\n")
230
+
231
+ if len(post.Content) == 0 {
232
+ // TEXT WITH BACKGROUND
233
+ post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
234
+ }
235
+
236
+ post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
237
+ post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
238
+ post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
239
+ post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
240
+ i, _ := strconv.ParseInt(id, 10, 64)
241
+ return &fbcrawl.FacebookImage{
242
+ Id: i,
243
+ }
244
+ })).([]*fbcrawl.FacebookImage)
245
+
246
+ if fbDataFt.PhotoId > 0 {
247
+ post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
248
+ }
249
+ result = append(result, post)
214
250
  })
215
251
 
216
252
  err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
@@ -300,7 +336,7 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
300
336
  }
301
337
 
302
338
  post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
- post.ReactionCount = getReactionFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
339
+ post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
304
340
  post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
305
341
  i, _ := strconv.ParseInt(id, 10, 64)
306
342
  return &fbcrawl.FacebookImage{
@@ -311,10 +347,6 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
311
347
  if result.PhotoId > 0 {
312
348
  post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
313
349
  }
314
-
315
- logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
316
- return selection.Text()
317
- }), "\n"))
318
350
  }
319
351
 
320
352
  //Comment
@@ -375,11 +407,18 @@ func getImageIdFromHref(href string) int64 {
375
407
  return i
376
408
  }
377
409
 
378
- func getReactionFromText(text string) int64 {
410
+ func getNumberFromText(text string) int64 {
379
411
  logger.Error("reaction", text)
380
412
  if len(text) > 0 {
381
- id, _ := strconv.ParseInt(regexp.MustCompile("(\\d+)").FindStringSubmatch(text)[1], 10, 64)
382
- return id
413
+ match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
414
+ count, _ := strconv.ParseInt(match[1], 10, 64)
415
+ switch match[2] {
416
+ case "k":
417
+ count *= 1000
418
+ case "m":
419
+ count *= 1000000
420
+ }
421
+ return count
383
422
  }
384
423
  return 0
385
424
  }
@@ -25,6 +25,7 @@ message FacebookPost {
25
25
  repeated FacebookComment comments = 5;
26
26
  int64 created_at = 9;
27
27
  int64 reaction_count = 10;
28
+ int64 comment_count = 11;
28
29
  }
29
30
 
30
31
  message FacebookImage {
@@ -1,3 +1,3 @@
1
1
  module FbcrawlColly
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fbcrawl-colly
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duy Le
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-13 00:00:00.000000000 Z
11
+ date: 2020-08-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi