fbcrawl-colly 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/fbcolly/fbcolly.go +56 -17
- data/fbcrawl.proto +1 -0
- data/lib/fbcrawl_colly/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e43a203c1c38131f0d9eb32b67a1859bd1530aac9249751ff0329a5901388f6
|
4
|
+
data.tar.gz: f3520762f00d2b1a2d475a9b97d2141fe8cd98faf718ccf3d8a797e61489f976
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2560aa6a3239671a14f82a95034bbad03684dd44b0fc32cb2092b404678b2ec0e124a33db64a92f5c22f4cd3d9d94a42ccc40a6ab2e1e362088d0f02f05d22f3
|
7
|
+
data.tar.gz: 305f3d463d75bc3edcfcd1b5a985d9a9882e990e850786132eb1b6d4e93f4e3e31a2c0e9aeb23141cc0633cd1d5ad8e1d88ec4c0e3e4d2d201a6298a7c8699c6
|
data/Gemfile.lock
CHANGED
data/fbcolly/fbcolly.go
CHANGED
@@ -201,16 +201,52 @@ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostLis
|
|
201
201
|
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
202
202
|
}
|
203
203
|
})
|
204
|
+
collector.OnHTML("div[role=\"article\"]", func(element *colly.HTMLElement) {
|
205
|
+
dataElement := element
|
206
|
+
post := &fbcrawl.FacebookPost{}
|
207
|
+
var fbDataFt FbDataFt
|
208
|
+
jsonData := dataElement.Attr("data-ft")
|
209
|
+
|
210
|
+
logger.Info(jsonData)
|
211
|
+
err = json.Unmarshal([]byte(jsonData), &fbDataFt)
|
212
|
+
if err != nil {
|
213
|
+
logger.Error(err)
|
214
|
+
return
|
215
|
+
}
|
216
|
+
logger.Info("Post ", fbDataFt)
|
217
|
+
post.Id = fbDataFt.TopLevelPostId
|
218
|
+
post.Group = &fbcrawl.FacebookGroup{Id: fbDataFt.PageId, Name: dataElement.DOM.Find("h3 strong:nth-child(2) a").Text()}
|
219
|
+
post.User = &fbcrawl.FacebookUser{
|
220
|
+
Id: fbDataFt.ContentOwnerIdNew,
|
221
|
+
Name: dataElement.DOM.Find("h3 strong:nth-child(1) a").Text(),
|
222
|
+
}
|
223
|
+
post.CreatedAt = fbDataFt.PageInsights[strconv.FormatInt(fbDataFt.PageId, 10)].PublishTime
|
224
|
+
//Content
|
204
225
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
226
|
+
//NO BACKGROUND TEXT ONLY
|
227
|
+
post.Content = strings.Join(dataElement.DOM.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
228
|
+
return selection.Text()
|
229
|
+
}), "\n")
|
230
|
+
|
231
|
+
if len(post.Content) == 0 {
|
232
|
+
// TEXT WITH BACKGROUND
|
233
|
+
post.Content = dataElement.DOM.Find("div[style*=\"background-image:url\"]").Text()
|
234
|
+
}
|
235
|
+
|
236
|
+
post.ContentLink = getUrlFromRedirectHref(dataElement.DOM.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
237
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"]").Text())
|
238
|
+
post.CommentCount = getNumberFromText(element.DOM.Find("span[id*=\"like_\"] ~ a").Text())
|
239
|
+
post.ContentImages = (funk.Map(fbDataFt.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
|
240
|
+
i, _ := strconv.ParseInt(id, 10, 64)
|
241
|
+
return &fbcrawl.FacebookImage{
|
242
|
+
Id: i,
|
243
|
+
}
|
244
|
+
})).([]*fbcrawl.FacebookImage)
|
245
|
+
|
246
|
+
if fbDataFt.PhotoId > 0 {
|
247
|
+
post.ContentImage = &fbcrawl.FacebookImage{Id: fbDataFt.PhotoId}
|
248
|
+
}
|
249
|
+
result = append(result, post)
|
214
250
|
})
|
215
251
|
|
216
252
|
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
@@ -300,7 +336,7 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
300
336
|
}
|
301
337
|
|
302
338
|
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
303
|
-
post.ReactionCount =
|
339
|
+
post.ReactionCount = getNumberFromText(element.DOM.Find("div[id*=\"sentence_\"]").Text())
|
304
340
|
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
|
305
341
|
i, _ := strconv.ParseInt(id, 10, 64)
|
306
342
|
return &fbcrawl.FacebookImage{
|
@@ -311,10 +347,6 @@ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.Facebo
|
|
311
347
|
if result.PhotoId > 0 {
|
312
348
|
post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
|
313
349
|
}
|
314
|
-
|
315
|
-
logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
316
|
-
return selection.Text()
|
317
|
-
}), "\n"))
|
318
350
|
}
|
319
351
|
|
320
352
|
//Comment
|
@@ -375,11 +407,18 @@ func getImageIdFromHref(href string) int64 {
|
|
375
407
|
return i
|
376
408
|
}
|
377
409
|
|
378
|
-
func
|
410
|
+
func getNumberFromText(text string) int64 {
|
379
411
|
logger.Error("reaction", text)
|
380
412
|
if len(text) > 0 {
|
381
|
-
|
382
|
-
|
413
|
+
match := regexp.MustCompile("(\\d*)\\s?([km]?)").FindStringSubmatch(text)
|
414
|
+
count, _ := strconv.ParseInt(match[1], 10, 64)
|
415
|
+
switch match[2] {
|
416
|
+
case "k":
|
417
|
+
count *= 1000
|
418
|
+
case "m":
|
419
|
+
count *= 1000000
|
420
|
+
}
|
421
|
+
return count
|
383
422
|
}
|
384
423
|
return 0
|
385
424
|
}
|
data/fbcrawl.proto
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fbcrawl-colly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Duy Le
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|