fbcrawl-colly 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/Gemfile.lock +28 -0
- data/Rakefile +9 -0
- data/ext/fbcrawl_colly/extconf.rb +5 -4
- data/fbcolly/fbcolly.go +172 -30
- data/fbcrawl-colly.gemspec +2 -1
- data/fbcrawl.proto +19 -4
- data/go.mod +1 -0
- data/go.sum +7 -0
- data/lib/fbcrawl-colly.rb +1 -13
- data/lib/fbcrawl_colly/colly.rb +50 -0
- data/lib/fbcrawl_colly/ffi.rb +17 -0
- data/lib/{fbcrawl-colly → fbcrawl_colly}/version.rb +1 -1
- data/main.go +42 -9
- metadata +20 -4
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 22ec88aaf52a44344fc32ab22f6914f264b49ea10912c527c030ffd97ac12d36
|
4
|
+
data.tar.gz: d007e4326b15c64e725009548c8e9dac00e263da888d736876b6093a4fba5108
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 448bbb6b045d3b4baa3c0eae23bad85f0daf93159f514ff77e0f9383350b1c52aa3ab5360646099e56ccc19eea0bc8300ba42e67a26262e1e1f3141be61515c0
|
7
|
+
data.tar.gz: 5d326d26fb0354ea78236849be7328ce49f919435c7c373130f19439c2447f9026ce31f88bc64d5cc72581fadc618a0891cd777f885e438f01217295b58309e5
|
data/.gitignore
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
fbcrawl-colly (0.1.1)
|
5
|
+
ffi
|
6
|
+
google-protobuf
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
ffi (1.13.1)
|
12
|
+
google-protobuf (3.12.4-universal-darwin)
|
13
|
+
minitest (5.14.1)
|
14
|
+
rake (12.3.3)
|
15
|
+
rake-compiler (1.1.1)
|
16
|
+
rake
|
17
|
+
|
18
|
+
PLATFORMS
|
19
|
+
ruby
|
20
|
+
|
21
|
+
DEPENDENCIES
|
22
|
+
fbcrawl-colly!
|
23
|
+
minitest (~> 5.0)
|
24
|
+
rake (~> 12.0)
|
25
|
+
rake-compiler
|
26
|
+
|
27
|
+
BUNDLED WITH
|
28
|
+
2.1.4
|
data/Rakefile
CHANGED
@@ -7,4 +7,13 @@ Rake::TestTask.new(:test) do |t|
|
|
7
7
|
t.test_files = FileList["test/**/*_test.rb"]
|
8
8
|
end
|
9
9
|
|
10
|
+
task :fbcrawl_colly do
|
11
|
+
Dir.chdir("./ext/fbcrawl_colly/") do
|
12
|
+
require './extconf'
|
13
|
+
`make`
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
task :compile => [:fbcrawl_colly]
|
18
|
+
task :test => :compile
|
10
19
|
task :default => :test
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'mkmf'
|
2
|
-
|
3
|
-
MakeMakefile::find_executable '
|
4
|
-
MakeMakefile::find_executable 'protoc
|
5
|
-
|
2
|
+
requirement_passed = true
|
3
|
+
requirement_passed &&= MakeMakefile::find_executable 'go'
|
4
|
+
requirement_passed &&= MakeMakefile::find_executable 'protoc'
|
5
|
+
requirement_passed &&= MakeMakefile::find_executable 'protoc-gen-go'
|
6
|
+
$makefile_created = requirement_passed
|
data/fbcolly/fbcolly.go
CHANGED
@@ -10,14 +10,35 @@ import (
|
|
10
10
|
"github.com/gocolly/colly/extensions"
|
11
11
|
"github.com/gocolly/colly/storage"
|
12
12
|
"github.com/google/logger"
|
13
|
+
"github.com/olebedev/when"
|
14
|
+
"github.com/olebedev/when/rules/common"
|
15
|
+
"github.com/olebedev/when/rules/en"
|
16
|
+
"github.com/thoas/go-funk"
|
13
17
|
"net/url"
|
14
18
|
"qnetwork.net/fbcrawl/fbcrawl"
|
15
19
|
"regexp"
|
20
|
+
"strconv"
|
16
21
|
"strings"
|
22
|
+
"time"
|
17
23
|
)
|
18
24
|
|
19
25
|
type Fbcolly struct {
|
20
26
|
collector *colly.Collector
|
27
|
+
w *when.Parser
|
28
|
+
}
|
29
|
+
type FbDataPostContext struct {
|
30
|
+
PublishTime int64 `json:"publish_time"`
|
31
|
+
}
|
32
|
+
type FbDataInsight struct {
|
33
|
+
FbDataPostContext `json:"post_context"`
|
34
|
+
}
|
35
|
+
type FbDataFt struct {
|
36
|
+
ContentOwnerIdNew int64 `json:"content_owner_id_new"`
|
37
|
+
PhotoAttachmentsList []string `json:"photo_attachments_list"`
|
38
|
+
PhotoId int64 `json:"photo_id,string"`
|
39
|
+
PageId int64 `json:"page_id,string"`
|
40
|
+
TopLevelPostId int64 `json:"top_level_post_id,string"`
|
41
|
+
PageInsights map[string]FbDataInsight `json:"page_insights"`
|
21
42
|
}
|
22
43
|
|
23
44
|
func sharedOnRequest(request *colly.Request) {
|
@@ -79,16 +100,19 @@ func getForm(element *colly.HTMLElement, err error) (string, error, map[string]s
|
|
79
100
|
func New() *Fbcolly {
|
80
101
|
f := Fbcolly{}
|
81
102
|
f.collector = colly.NewCollector()
|
103
|
+
f.w = when.New(nil)
|
104
|
+
f.w.Add(en.All...)
|
105
|
+
f.w.Add(common.All...)
|
82
106
|
return &f
|
83
107
|
}
|
84
108
|
|
85
|
-
func (f *Fbcolly) Login(email string, password string, otp string) error {
|
109
|
+
func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
|
86
110
|
collector := f.collector.Clone()
|
87
|
-
setupSharedCollector(collector)
|
111
|
+
err := setupSharedCollector(collector)
|
88
112
|
|
89
113
|
logger.Info("Login using email", email)
|
114
|
+
loggedIn := false
|
90
115
|
|
91
|
-
var err error
|
92
116
|
collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
|
93
117
|
logger.Info("OnHTML login_form")
|
94
118
|
loginURL, err, reqMap := getForm(element, err)
|
@@ -106,12 +130,13 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
106
130
|
})
|
107
131
|
|
108
132
|
collector.OnHTML("a[href=\"/login/save-device/cancel/?flow=interstitial_nux&nux_source=regular_login\"]", func(element *colly.HTMLElement) {
|
109
|
-
collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
133
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
110
134
|
})
|
111
135
|
|
112
136
|
collector.OnHTML("form[action=\"/login/checkpoint/\"]", func(element *colly.HTMLElement) {
|
113
137
|
|
114
138
|
checkpointUrl, err, reqMap := getForm(element, err)
|
139
|
+
shouldSubmit := false
|
115
140
|
if err != nil {
|
116
141
|
logger.Error(err)
|
117
142
|
return
|
@@ -121,6 +146,7 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
121
146
|
//Save Device
|
122
147
|
logger.Info("OnHTML Save Device checkpoint")
|
123
148
|
reqMap["name_action_selected"] = "dont_save"
|
149
|
+
shouldSubmit = true
|
124
150
|
} else if element.DOM.Find("input[name=\"approvals_code\"]").Length() > 0 {
|
125
151
|
logger.Info("OnHTML OTP checkpoint")
|
126
152
|
//logger.Info("Please input OTP")
|
@@ -128,11 +154,15 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
128
154
|
//code, _ := reader.ReadString('\n')
|
129
155
|
code := otp[0:6]
|
130
156
|
reqMap["approvals_code"] = code
|
157
|
+
shouldSubmit = true
|
131
158
|
} else {
|
132
159
|
logger.Info("OnHTML Only Continue checkpoint")
|
160
|
+
|
161
|
+
}
|
162
|
+
if shouldSubmit {
|
163
|
+
logger.Info("req map:", reqMap)
|
164
|
+
err = collector.Post(checkpointUrl, reqMap)
|
133
165
|
}
|
134
|
-
logger.Info("req map:", reqMap)
|
135
|
-
err = collector.Post(checkpointUrl, reqMap)
|
136
166
|
if err != nil {
|
137
167
|
logger.Error("post err:", err)
|
138
168
|
}
|
@@ -141,19 +171,24 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
141
171
|
collector.OnHTML("form[action=\"/search/\"]", func(element *colly.HTMLElement) {
|
142
172
|
//We're in home
|
143
173
|
logger.Info("I'm IN HOME, navigate to page now")
|
174
|
+
loggedIn = true
|
144
175
|
})
|
145
176
|
|
146
177
|
err = collector.Visit("https://mbasic.facebook.com/")
|
147
178
|
if err != nil {
|
148
179
|
logger.Error("crawl by colly err:", err)
|
149
180
|
}
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
181
|
+
|
182
|
+
if loggedIn {
|
183
|
+
logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
|
184
|
+
return storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")), err
|
185
|
+
} else {
|
186
|
+
return "", err
|
187
|
+
}
|
188
|
+
|
154
189
|
}
|
155
190
|
|
156
|
-
func (f *Fbcolly) FetchGroupFeed(groupId
|
191
|
+
func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
|
157
192
|
collector := f.collector.Clone()
|
158
193
|
err := setupSharedCollector(collector)
|
159
194
|
currentPage := 1
|
@@ -163,72 +198,179 @@ func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostLi
|
|
163
198
|
currentPage++
|
164
199
|
if currentPage < 3 {
|
165
200
|
logger.Info("Will fetch page", currentPage)
|
166
|
-
collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
201
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
167
202
|
}
|
168
203
|
})
|
169
204
|
|
170
|
-
//TODO: May not need this
|
171
205
|
collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
|
206
|
+
logger.Info("Post found at", element.Attr("href"))
|
172
207
|
u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
|
173
|
-
|
208
|
+
postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
|
174
209
|
result = append(result, &fbcrawl.FacebookPost{
|
175
|
-
Id:
|
210
|
+
Id: postId,
|
176
211
|
Group: &fbcrawl.FacebookGroup{Id: groupId},
|
177
212
|
})
|
178
213
|
//f.detailCollector.Visit(url)
|
179
214
|
})
|
180
215
|
|
181
|
-
err = collector.Visit("https://mbasic.facebook.com/groups
|
216
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
182
217
|
if err != nil {
|
183
218
|
logger.Error("crawl by colly err:", err)
|
184
219
|
}
|
185
220
|
return err, &fbcrawl.FacebookPostList{Posts: result}
|
186
221
|
}
|
187
222
|
|
188
|
-
func (f *Fbcolly)
|
223
|
+
func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
|
224
|
+
collector := f.collector.Clone()
|
225
|
+
err := setupSharedCollector(collector)
|
226
|
+
currentPage := 1
|
227
|
+
var result []*fbcrawl.FacebookImage
|
228
|
+
|
229
|
+
collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
|
230
|
+
currentPage++
|
231
|
+
logger.Info("Will fetch page", currentPage)
|
232
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
233
|
+
})
|
234
|
+
|
235
|
+
collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
|
236
|
+
result = append(result, &fbcrawl.FacebookImage{
|
237
|
+
Id: getImageIdFromHref(element.Attr("href")),
|
238
|
+
})
|
239
|
+
//f.detailCollector.Visit(url)
|
240
|
+
})
|
241
|
+
|
242
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
243
|
+
if err != nil {
|
244
|
+
logger.Error("crawl by colly err:", err)
|
245
|
+
}
|
246
|
+
return err, &fbcrawl.FacebookImageList{Images: result}
|
247
|
+
}
|
248
|
+
|
249
|
+
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
189
250
|
collector := f.collector.Clone()
|
190
251
|
err := setupSharedCollector(collector)
|
191
|
-
|
252
|
+
result := fbcrawl.FacebookImage{Id: imageId}
|
253
|
+
|
254
|
+
collector.OnHTML("a", func(element *colly.HTMLElement) {
|
255
|
+
result.Url = element.Attr("href")
|
256
|
+
})
|
257
|
+
|
258
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/photo/view_full_size/?fbid=%d", imageId))
|
259
|
+
if err != nil {
|
260
|
+
logger.Error("crawl by colly err:", err)
|
261
|
+
}
|
262
|
+
return err, &result
|
263
|
+
}
|
264
|
+
|
265
|
+
func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
|
266
|
+
collector := f.collector.Clone()
|
267
|
+
err := setupSharedCollector(collector)
|
268
|
+
post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
|
269
|
+
commentPaging := 0
|
192
270
|
collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
|
193
271
|
dataElement := element.DOM.Find("div[data-ft]")
|
194
272
|
if dataElement.Length() > 0 {
|
195
|
-
var result
|
273
|
+
var result FbDataFt
|
196
274
|
jsonData, isExist := dataElement.Attr("data-ft")
|
197
275
|
if isExist {
|
198
|
-
|
276
|
+
logger.Info(jsonData)
|
277
|
+
err = json.Unmarshal([]byte(jsonData), &result)
|
278
|
+
if err != nil {
|
279
|
+
logger.Error(err)
|
280
|
+
return
|
281
|
+
}
|
199
282
|
logger.Info("Post ", result)
|
200
|
-
post.Id = result
|
201
|
-
post.Group = &fbcrawl.FacebookGroup{Id: result
|
202
|
-
post.User = &fbcrawl.FacebookUser{
|
283
|
+
post.Id = result.TopLevelPostId
|
284
|
+
post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
|
285
|
+
post.User = &fbcrawl.FacebookUser{
|
286
|
+
Id: result.ContentOwnerIdNew,
|
287
|
+
Name: dataElement.Find("h3 strong:first-child a").Text(),
|
288
|
+
}
|
289
|
+
post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
|
203
290
|
//Content
|
291
|
+
|
292
|
+
//NO BACKGROUND TEXT ONLY
|
204
293
|
post.Content = strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
205
294
|
return selection.Text()
|
206
295
|
}), "\n")
|
207
296
|
|
297
|
+
if len(post.Content) == 0 {
|
298
|
+
// TEXT WITH BACKGROUND
|
299
|
+
post.Content = dataElement.Find("div[style*=\"background-image:url\"]").Text()
|
300
|
+
}
|
301
|
+
|
302
|
+
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
303
|
+
|
304
|
+
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
|
305
|
+
i, _ := strconv.ParseInt(id, 10, 64)
|
306
|
+
return &fbcrawl.FacebookImage{
|
307
|
+
Id: i,
|
308
|
+
}
|
309
|
+
})).([]*fbcrawl.FacebookImage)
|
310
|
+
|
311
|
+
if result.PhotoId > 0 {
|
312
|
+
post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
|
313
|
+
}
|
314
|
+
|
208
315
|
logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
209
316
|
return selection.Text()
|
210
317
|
}), "\n"))
|
211
318
|
}
|
212
|
-
|
319
|
+
|
213
320
|
//Comment
|
214
321
|
element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
|
215
322
|
//author
|
216
|
-
commentId := selection.AttrOr("id", "")
|
323
|
+
commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
|
217
324
|
logger.Info("comment", commentId)
|
218
|
-
|
219
|
-
//idRegex.FindString()
|
325
|
+
createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
|
220
326
|
post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
|
221
327
|
Id: commentId,
|
222
328
|
Post: &fbcrawl.FacebookPost{Id: post.Id},
|
223
329
|
User: &fbcrawl.FacebookUser{
|
224
|
-
Id:
|
330
|
+
Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
|
225
331
|
Name: selection.Find("h3 > a").Text(),
|
226
332
|
},
|
227
|
-
Content:
|
333
|
+
Content: selection.Find("h3 + div").Text(),
|
334
|
+
CreatedAt: createdAtWhenResult.Time.Unix(),
|
228
335
|
})
|
229
336
|
})
|
337
|
+
|
338
|
+
}
|
339
|
+
})
|
340
|
+
|
341
|
+
collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
|
342
|
+
if commentPaging < 3 {
|
343
|
+
logger.Info("Comment paging", commentPaging)
|
344
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
345
|
+
commentPaging = commentPaging + 1
|
230
346
|
}
|
231
347
|
})
|
232
|
-
|
348
|
+
|
349
|
+
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
233
350
|
return err, post
|
234
351
|
}
|
352
|
+
|
353
|
+
func (f *Fbcolly) LoginWithCookies(cookies string) error {
|
354
|
+
collector := f.collector
|
355
|
+
return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
|
356
|
+
}
|
357
|
+
|
358
|
+
//func getUsernameFromHref(href string) string {
|
359
|
+
// return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
|
360
|
+
//}
|
361
|
+
|
362
|
+
func getUserIdFromCommentHref(href string) int64 {
|
363
|
+
id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
|
364
|
+
return id
|
365
|
+
}
|
366
|
+
|
367
|
+
func getUrlFromRedirectHref(href string) string {
|
368
|
+
u, _ := url.Parse(href)
|
369
|
+
return u.Query().Get("u")
|
370
|
+
}
|
371
|
+
|
372
|
+
func getImageIdFromHref(href string) int64 {
|
373
|
+
u, _ := url.Parse(href)
|
374
|
+
i, _ := strconv.ParseInt(u.Query().Get("fbid"), 10, 64)
|
375
|
+
return i
|
376
|
+
}
|
data/fbcrawl-colly.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require_relative 'lib/
|
1
|
+
require_relative 'lib/fbcrawl_colly/version'
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "fbcrawl-colly"
|
@@ -32,4 +32,5 @@ Gem::Specification.new do |spec|
|
|
32
32
|
|
33
33
|
spec.add_runtime_dependency 'ffi'
|
34
34
|
spec.add_runtime_dependency 'google-protobuf'
|
35
|
+
spec.add_development_dependency 'rake-compiler'
|
35
36
|
end
|
data/fbcrawl.proto
CHANGED
@@ -1,33 +1,48 @@
|
|
1
1
|
syntax = "proto3";
|
2
2
|
|
3
|
+
package fbcrawl_colly;
|
3
4
|
option go_package = "./fbcrawl;fbcrawl";
|
4
5
|
|
5
6
|
// The request message containing the user's name.
|
6
7
|
message FacebookGroup {
|
7
|
-
|
8
|
+
int64 id = 1;
|
8
9
|
string name = 2;
|
9
10
|
}
|
10
11
|
|
11
12
|
message FacebookUser {
|
12
|
-
|
13
|
+
int64 id = 1;
|
13
14
|
string name = 2;
|
14
15
|
}
|
15
16
|
|
16
17
|
message FacebookPost {
|
17
|
-
|
18
|
+
int64 id = 1;
|
18
19
|
FacebookGroup group = 2;
|
19
20
|
FacebookUser user = 3;
|
20
21
|
string content = 4;
|
22
|
+
string content_link = 6;
|
23
|
+
FacebookImage content_image = 8;
|
24
|
+
repeated FacebookImage content_images = 7;
|
21
25
|
repeated FacebookComment comments = 5;
|
26
|
+
int64 created_at = 9;
|
27
|
+
}
|
28
|
+
|
29
|
+
message FacebookImage {
|
30
|
+
int64 id = 1;
|
31
|
+
string url = 2;
|
22
32
|
}
|
23
33
|
|
24
34
|
message FacebookComment {
|
25
|
-
|
35
|
+
int64 id = 1;
|
26
36
|
FacebookPost post = 2;
|
27
37
|
FacebookUser user = 3;
|
28
38
|
string content = 4;
|
39
|
+
int64 created_at = 5;
|
29
40
|
}
|
30
41
|
|
31
42
|
message FacebookPostList {
|
32
43
|
repeated FacebookPost posts = 1;
|
33
44
|
}
|
45
|
+
|
46
|
+
message FacebookImageList {
|
47
|
+
repeated FacebookImage images = 1;
|
48
|
+
}
|
data/go.mod
CHANGED
@@ -13,6 +13,7 @@ require (
|
|
13
13
|
github.com/golang/protobuf v1.4.2
|
14
14
|
github.com/google/logger v1.1.0
|
15
15
|
github.com/kennygrant/sanitize v1.2.4 // indirect
|
16
|
+
github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254
|
16
17
|
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
17
18
|
github.com/temoto/robotstxt v1.1.1 // indirect
|
18
19
|
github.com/thoas/go-funk v0.7.0
|
data/go.sum
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
|
2
|
+
github.com/AlekSi/pointer v1.0.0 h1:KWCWzsvFxNLcmM5XmiqHsGTTsuwZMsLFwWF9Y+//bNE=
|
3
|
+
github.com/AlekSi/pointer v1.0.0/go.mod h1:1kjywbfcPFCmncIxtk6fIEub6LKrfMz3gc5QKVOSOA8=
|
2
4
|
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
3
5
|
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
|
4
6
|
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
@@ -18,6 +20,7 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
|
|
18
20
|
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
|
19
21
|
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
|
20
22
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
23
|
+
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
21
24
|
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
22
25
|
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
|
23
26
|
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
|
@@ -50,6 +53,10 @@ github.com/google/logger v1.1.0 h1:saB74Etb4EAJNH3z74CVbCKk75hld/8T0CsXKetWCwM=
|
|
50
53
|
github.com/google/logger v1.1.0/go.mod h1:w7O8nrRr0xufejBlQMI83MXqRusvREoJdaAxV+CoAB4=
|
51
54
|
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
|
52
55
|
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
|
56
|
+
github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254 h1:JYoQR67E1vv1WGoeW8DkdFs7vrIEe/5wP+qJItd5tUE=
|
57
|
+
github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254/go.mod h1:DPucAeQGDPUzYUt+NaWw6qsF5SFapWWToxEiVDh2aV0=
|
58
|
+
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
|
59
|
+
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
53
60
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
54
61
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
55
62
|
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
|
data/lib/fbcrawl-colly.rb
CHANGED
@@ -1,16 +1,4 @@
|
|
1
|
-
require 'ffi'
|
2
|
-
require 'fbcrawl_pb'
|
3
|
-
module FbcrawlColly
|
4
|
-
extend FFI::Library
|
5
1
|
|
6
|
-
|
7
|
-
attach_function :free, [ :pointer ], :void
|
2
|
+
module FbcrawlColly
|
8
3
|
|
9
|
-
attach_function :Init, [], :pointer
|
10
|
-
attach_function :Login, [:pointer, :string, :string], :void
|
11
|
-
attach_function :FetchGroupFeed, [:pointer, :string], :string
|
12
|
-
attach_function :FetchPost, [:pointer, :string, :string], :string
|
13
|
-
# attach_function :FetchGroup, [:pointer, :string], :pointer
|
14
|
-
# attach_function :Login, [:pointer, :string, :string, :string], :void
|
15
|
-
# attach_function :FreePointer, [:pointer], :void
|
16
4
|
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
require_relative '../fbcrawl_pb'
|
3
|
+
require_relative './ffi'
|
4
|
+
|
5
|
+
module FbcrawlColly
|
6
|
+
class Colly
|
7
|
+
def initialize
|
8
|
+
super
|
9
|
+
@colly = ::FFI::AutoPointer.new(FbcrawlColly::FFI::Init(), FbcrawlColly::FFI.method(:FreeColly))
|
10
|
+
end
|
11
|
+
|
12
|
+
def login(email, password)
|
13
|
+
s, ptr = FbcrawlColly::FFI.Login(@colly, email, password)
|
14
|
+
FbcrawlColly::FFI.free(ptr)
|
15
|
+
s
|
16
|
+
end
|
17
|
+
|
18
|
+
def login_with_cookies(cookies)
|
19
|
+
FbcrawlColly::FFI.LoginWithCookies(@colly, cookies)
|
20
|
+
end
|
21
|
+
|
22
|
+
def fetch_group_feed(group_id)
|
23
|
+
s, ptr = FbcrawlColly::FFI.FetchGroupFeed(@colly, group_id)
|
24
|
+
list = FbcrawlColly::FacebookPostList.decode(s)
|
25
|
+
FbcrawlColly::FFI.free(ptr)
|
26
|
+
list
|
27
|
+
end
|
28
|
+
|
29
|
+
def fetch_post(group_id, post_id)
|
30
|
+
s, ptr = FbcrawlColly::FFI.FetchPost(@colly, group_id, post_id)
|
31
|
+
post = FbcrawlColly::FacebookPost.decode(s)
|
32
|
+
FbcrawlColly::FFI.free(ptr)
|
33
|
+
post
|
34
|
+
end
|
35
|
+
|
36
|
+
def fetch_content_images(post_id)
|
37
|
+
s, ptr = FbcrawlColly::FFI.FetchContentImages(@colly, post_id)
|
38
|
+
imageList = FbcrawlColly::FacebookImageList.decode(s)
|
39
|
+
FbcrawlColly::FFI.free(ptr)
|
40
|
+
imageList
|
41
|
+
end
|
42
|
+
|
43
|
+
def fetch_image_url(image_id)
|
44
|
+
s, ptr = FbcrawlColly::FFI.FetchImageUrl(@colly, image_id)
|
45
|
+
image = FbcrawlColly::FacebookImage.decode(s)
|
46
|
+
FbcrawlColly::FFI.free(ptr)
|
47
|
+
image
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
module FbcrawlColly::FFI
|
3
|
+
extend FFI::Library
|
4
|
+
|
5
|
+
ffi_lib File.expand_path("../../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
|
6
|
+
attach_function :free, [ :pointer ], :void
|
7
|
+
|
8
|
+
attach_function :Init, [], :pointer
|
9
|
+
attach_function :FreeColly, [:pointer], :pointer
|
10
|
+
attach_function :Login, [:pointer, :string, :string], :strptr
|
11
|
+
attach_function :LoginWithCookies, [:pointer, :string], :void
|
12
|
+
attach_function :FetchGroupFeed, [:pointer, :int64], :strptr
|
13
|
+
attach_function :FetchPost, [:pointer, :int64, :int64], :strptr
|
14
|
+
attach_function :FetchContentImages, [:pointer, :int64], :strptr
|
15
|
+
attach_function :FetchImageUrl, [:pointer, :int64], :strptr
|
16
|
+
# attach_function :FetchGroup, [:pointer, :string], :pointer
|
17
|
+
end
|
data/main.go
CHANGED
@@ -27,36 +27,69 @@ var password = flag.String("password", "change_me", "facebook password")
|
|
27
27
|
var otp = flag.String("otp", "123456", "facebook otp")
|
28
28
|
var groupId = flag.String("groupId", "334294967318328", "facebook group id, default is 334294967318328")
|
29
29
|
|
30
|
-
var
|
30
|
+
var allInstances = map[uintptr]*fbcolly.Fbcolly{}
|
31
31
|
|
32
32
|
//export Init
|
33
33
|
func Init() uintptr {
|
34
|
-
|
34
|
+
instance := fbcolly.New()
|
35
|
+
ptr := (uintptr)(unsafe.Pointer(instance))
|
36
|
+
allInstances[ptr] = instance
|
37
|
+
return ptr
|
38
|
+
}
|
39
|
+
|
40
|
+
//export FreeColly
|
41
|
+
func FreeColly(pointer unsafe.Pointer) {
|
42
|
+
delete(allInstances, uintptr(pointer))
|
35
43
|
}
|
36
44
|
|
37
45
|
//export Login
|
38
|
-
func Login(pointer unsafe.Pointer, email *C.char, password *C.char) {
|
46
|
+
func Login(pointer unsafe.Pointer, email *C.char, password *C.char) *C.char {
|
47
|
+
p := (*fbcolly.Fbcolly)(pointer)
|
48
|
+
cookies, err := p.Login(C.GoString(email), C.GoString(password), "")
|
49
|
+
if err == nil {
|
50
|
+
return C.CString(cookies)
|
51
|
+
}
|
52
|
+
return nil
|
53
|
+
}
|
54
|
+
|
55
|
+
//export LoginWithCookies
|
56
|
+
func LoginWithCookies(pointer unsafe.Pointer, cookies *C.char) {
|
39
57
|
p := (*fbcolly.Fbcolly)(pointer)
|
40
|
-
|
41
|
-
p.Login(C.GoString(email), C.GoString(password), "")
|
58
|
+
p.LoginWithCookies(C.GoString(cookies))
|
42
59
|
}
|
43
60
|
|
44
61
|
//export FetchGroupFeed
|
45
|
-
func FetchGroupFeed(pointer unsafe.Pointer, groupId
|
62
|
+
func FetchGroupFeed(pointer unsafe.Pointer, groupId int64) unsafe.Pointer {
|
46
63
|
p := (*fbcolly.Fbcolly)(pointer)
|
47
|
-
_, postsList := p.FetchGroupFeed(
|
64
|
+
_, postsList := p.FetchGroupFeed(groupId)
|
48
65
|
marshaledPostsList, _ := proto.Marshal(postsList)
|
49
66
|
return C.CBytes(append(marshaledPostsList, 0))
|
50
67
|
}
|
51
68
|
|
52
69
|
//export FetchPost
|
53
|
-
func FetchPost(pointer unsafe.Pointer, groupId
|
70
|
+
func FetchPost(pointer unsafe.Pointer, groupId int64, postId int64) unsafe.Pointer {
|
54
71
|
p := (*fbcolly.Fbcolly)(pointer)
|
55
|
-
_, post := p.FetchPost(
|
72
|
+
_, post := p.FetchPost(groupId, postId)
|
56
73
|
marshaledPost, _ := proto.Marshal(post)
|
57
74
|
return C.CBytes(append(marshaledPost, 0))
|
58
75
|
}
|
59
76
|
|
77
|
+
//export FetchContentImages
|
78
|
+
func FetchContentImages(pointer unsafe.Pointer, postId int64) unsafe.Pointer {
|
79
|
+
p := (*fbcolly.Fbcolly)(pointer)
|
80
|
+
_, imageList := p.FetchContentImages(postId)
|
81
|
+
marshaled, _ := proto.Marshal(imageList)
|
82
|
+
return C.CBytes(append(marshaled, 0))
|
83
|
+
}
|
84
|
+
|
85
|
+
//export FetchImageUrl
|
86
|
+
func FetchImageUrl(pointer unsafe.Pointer, imageId int64) unsafe.Pointer {
|
87
|
+
p := (*fbcolly.Fbcolly)(pointer)
|
88
|
+
_, image := p.FetchImageUrl(imageId)
|
89
|
+
marshaled, _ := proto.Marshal(image)
|
90
|
+
return C.CBytes(append(marshaled, 0))
|
91
|
+
}
|
92
|
+
|
60
93
|
func main() {
|
61
94
|
//r := regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch()[1]
|
62
95
|
//print(r.FindStringSubmatch("/liem.phamthanh.161?refid=18&__tn__=R")[1])
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fbcrawl-colly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Duy Le
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: Crawl mbasic.facebook.com using GO Colly
|
42
56
|
email:
|
43
57
|
- duyleekun@gmail.com
|
@@ -47,9 +61,9 @@ extensions:
|
|
47
61
|
extra_rdoc_files: []
|
48
62
|
files:
|
49
63
|
- ".gitignore"
|
50
|
-
- ".travis.yml"
|
51
64
|
- CODE_OF_CONDUCT.md
|
52
65
|
- Gemfile
|
66
|
+
- Gemfile.lock
|
53
67
|
- LICENSE.txt
|
54
68
|
- README.md
|
55
69
|
- Rakefile
|
@@ -64,7 +78,9 @@ files:
|
|
64
78
|
- go.mod
|
65
79
|
- go.sum
|
66
80
|
- lib/fbcrawl-colly.rb
|
67
|
-
- lib/
|
81
|
+
- lib/fbcrawl_colly/colly.rb
|
82
|
+
- lib/fbcrawl_colly/ffi.rb
|
83
|
+
- lib/fbcrawl_colly/version.rb
|
68
84
|
- main.go
|
69
85
|
homepage: http://github.com/duyleekun/fbcrawl-colly
|
70
86
|
licenses:
|