fbcrawl-colly 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/Gemfile.lock +28 -0
- data/Rakefile +9 -0
- data/ext/fbcrawl_colly/extconf.rb +5 -4
- data/fbcolly/fbcolly.go +172 -30
- data/fbcrawl-colly.gemspec +2 -1
- data/fbcrawl.proto +19 -4
- data/go.mod +1 -0
- data/go.sum +7 -0
- data/lib/fbcrawl-colly.rb +1 -13
- data/lib/fbcrawl_colly/colly.rb +50 -0
- data/lib/fbcrawl_colly/ffi.rb +17 -0
- data/lib/{fbcrawl-colly → fbcrawl_colly}/version.rb +1 -1
- data/main.go +42 -9
- metadata +20 -4
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 22ec88aaf52a44344fc32ab22f6914f264b49ea10912c527c030ffd97ac12d36
|
4
|
+
data.tar.gz: d007e4326b15c64e725009548c8e9dac00e263da888d736876b6093a4fba5108
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 448bbb6b045d3b4baa3c0eae23bad85f0daf93159f514ff77e0f9383350b1c52aa3ab5360646099e56ccc19eea0bc8300ba42e67a26262e1e1f3141be61515c0
|
7
|
+
data.tar.gz: 5d326d26fb0354ea78236849be7328ce49f919435c7c373130f19439c2447f9026ce31f88bc64d5cc72581fadc618a0891cd777f885e438f01217295b58309e5
|
data/.gitignore
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
fbcrawl-colly (0.1.1)
|
5
|
+
ffi
|
6
|
+
google-protobuf
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
ffi (1.13.1)
|
12
|
+
google-protobuf (3.12.4-universal-darwin)
|
13
|
+
minitest (5.14.1)
|
14
|
+
rake (12.3.3)
|
15
|
+
rake-compiler (1.1.1)
|
16
|
+
rake
|
17
|
+
|
18
|
+
PLATFORMS
|
19
|
+
ruby
|
20
|
+
|
21
|
+
DEPENDENCIES
|
22
|
+
fbcrawl-colly!
|
23
|
+
minitest (~> 5.0)
|
24
|
+
rake (~> 12.0)
|
25
|
+
rake-compiler
|
26
|
+
|
27
|
+
BUNDLED WITH
|
28
|
+
2.1.4
|
data/Rakefile
CHANGED
@@ -7,4 +7,13 @@ Rake::TestTask.new(:test) do |t|
|
|
7
7
|
t.test_files = FileList["test/**/*_test.rb"]
|
8
8
|
end
|
9
9
|
|
10
|
+
task :fbcrawl_colly do
|
11
|
+
Dir.chdir("./ext/fbcrawl_colly/") do
|
12
|
+
require './extconf'
|
13
|
+
`make`
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
task :compile => [:fbcrawl_colly]
|
18
|
+
task :test => :compile
|
10
19
|
task :default => :test
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'mkmf'
|
2
|
-
|
3
|
-
MakeMakefile::find_executable '
|
4
|
-
MakeMakefile::find_executable 'protoc
|
5
|
-
|
2
|
+
requirement_passed = true
|
3
|
+
requirement_passed &&= MakeMakefile::find_executable 'go'
|
4
|
+
requirement_passed &&= MakeMakefile::find_executable 'protoc'
|
5
|
+
requirement_passed &&= MakeMakefile::find_executable 'protoc-gen-go'
|
6
|
+
$makefile_created = requirement_passed
|
data/fbcolly/fbcolly.go
CHANGED
@@ -10,14 +10,35 @@ import (
|
|
10
10
|
"github.com/gocolly/colly/extensions"
|
11
11
|
"github.com/gocolly/colly/storage"
|
12
12
|
"github.com/google/logger"
|
13
|
+
"github.com/olebedev/when"
|
14
|
+
"github.com/olebedev/when/rules/common"
|
15
|
+
"github.com/olebedev/when/rules/en"
|
16
|
+
"github.com/thoas/go-funk"
|
13
17
|
"net/url"
|
14
18
|
"qnetwork.net/fbcrawl/fbcrawl"
|
15
19
|
"regexp"
|
20
|
+
"strconv"
|
16
21
|
"strings"
|
22
|
+
"time"
|
17
23
|
)
|
18
24
|
|
19
25
|
type Fbcolly struct {
|
20
26
|
collector *colly.Collector
|
27
|
+
w *when.Parser
|
28
|
+
}
|
29
|
+
type FbDataPostContext struct {
|
30
|
+
PublishTime int64 `json:"publish_time"`
|
31
|
+
}
|
32
|
+
type FbDataInsight struct {
|
33
|
+
FbDataPostContext `json:"post_context"`
|
34
|
+
}
|
35
|
+
type FbDataFt struct {
|
36
|
+
ContentOwnerIdNew int64 `json:"content_owner_id_new"`
|
37
|
+
PhotoAttachmentsList []string `json:"photo_attachments_list"`
|
38
|
+
PhotoId int64 `json:"photo_id,string"`
|
39
|
+
PageId int64 `json:"page_id,string"`
|
40
|
+
TopLevelPostId int64 `json:"top_level_post_id,string"`
|
41
|
+
PageInsights map[string]FbDataInsight `json:"page_insights"`
|
21
42
|
}
|
22
43
|
|
23
44
|
func sharedOnRequest(request *colly.Request) {
|
@@ -79,16 +100,19 @@ func getForm(element *colly.HTMLElement, err error) (string, error, map[string]s
|
|
79
100
|
func New() *Fbcolly {
|
80
101
|
f := Fbcolly{}
|
81
102
|
f.collector = colly.NewCollector()
|
103
|
+
f.w = when.New(nil)
|
104
|
+
f.w.Add(en.All...)
|
105
|
+
f.w.Add(common.All...)
|
82
106
|
return &f
|
83
107
|
}
|
84
108
|
|
85
|
-
func (f *Fbcolly) Login(email string, password string, otp string) error {
|
109
|
+
func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
|
86
110
|
collector := f.collector.Clone()
|
87
|
-
setupSharedCollector(collector)
|
111
|
+
err := setupSharedCollector(collector)
|
88
112
|
|
89
113
|
logger.Info("Login using email", email)
|
114
|
+
loggedIn := false
|
90
115
|
|
91
|
-
var err error
|
92
116
|
collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
|
93
117
|
logger.Info("OnHTML login_form")
|
94
118
|
loginURL, err, reqMap := getForm(element, err)
|
@@ -106,12 +130,13 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
106
130
|
})
|
107
131
|
|
108
132
|
collector.OnHTML("a[href=\"/login/save-device/cancel/?flow=interstitial_nux&nux_source=regular_login\"]", func(element *colly.HTMLElement) {
|
109
|
-
collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
133
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
110
134
|
})
|
111
135
|
|
112
136
|
collector.OnHTML("form[action=\"/login/checkpoint/\"]", func(element *colly.HTMLElement) {
|
113
137
|
|
114
138
|
checkpointUrl, err, reqMap := getForm(element, err)
|
139
|
+
shouldSubmit := false
|
115
140
|
if err != nil {
|
116
141
|
logger.Error(err)
|
117
142
|
return
|
@@ -121,6 +146,7 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
121
146
|
//Save Device
|
122
147
|
logger.Info("OnHTML Save Device checkpoint")
|
123
148
|
reqMap["name_action_selected"] = "dont_save"
|
149
|
+
shouldSubmit = true
|
124
150
|
} else if element.DOM.Find("input[name=\"approvals_code\"]").Length() > 0 {
|
125
151
|
logger.Info("OnHTML OTP checkpoint")
|
126
152
|
//logger.Info("Please input OTP")
|
@@ -128,11 +154,15 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
128
154
|
//code, _ := reader.ReadString('\n')
|
129
155
|
code := otp[0:6]
|
130
156
|
reqMap["approvals_code"] = code
|
157
|
+
shouldSubmit = true
|
131
158
|
} else {
|
132
159
|
logger.Info("OnHTML Only Continue checkpoint")
|
160
|
+
|
161
|
+
}
|
162
|
+
if shouldSubmit {
|
163
|
+
logger.Info("req map:", reqMap)
|
164
|
+
err = collector.Post(checkpointUrl, reqMap)
|
133
165
|
}
|
134
|
-
logger.Info("req map:", reqMap)
|
135
|
-
err = collector.Post(checkpointUrl, reqMap)
|
136
166
|
if err != nil {
|
137
167
|
logger.Error("post err:", err)
|
138
168
|
}
|
@@ -141,19 +171,24 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
|
|
141
171
|
collector.OnHTML("form[action=\"/search/\"]", func(element *colly.HTMLElement) {
|
142
172
|
//We're in home
|
143
173
|
logger.Info("I'm IN HOME, navigate to page now")
|
174
|
+
loggedIn = true
|
144
175
|
})
|
145
176
|
|
146
177
|
err = collector.Visit("https://mbasic.facebook.com/")
|
147
178
|
if err != nil {
|
148
179
|
logger.Error("crawl by colly err:", err)
|
149
180
|
}
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
181
|
+
|
182
|
+
if loggedIn {
|
183
|
+
logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
|
184
|
+
return storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")), err
|
185
|
+
} else {
|
186
|
+
return "", err
|
187
|
+
}
|
188
|
+
|
154
189
|
}
|
155
190
|
|
156
|
-
func (f *Fbcolly) FetchGroupFeed(groupId
|
191
|
+
func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
|
157
192
|
collector := f.collector.Clone()
|
158
193
|
err := setupSharedCollector(collector)
|
159
194
|
currentPage := 1
|
@@ -163,72 +198,179 @@ func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostLi
|
|
163
198
|
currentPage++
|
164
199
|
if currentPage < 3 {
|
165
200
|
logger.Info("Will fetch page", currentPage)
|
166
|
-
collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
201
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
167
202
|
}
|
168
203
|
})
|
169
204
|
|
170
|
-
//TODO: May not need this
|
171
205
|
collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
|
206
|
+
logger.Info("Post found at", element.Attr("href"))
|
172
207
|
u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
|
173
|
-
|
208
|
+
postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
|
174
209
|
result = append(result, &fbcrawl.FacebookPost{
|
175
|
-
Id:
|
210
|
+
Id: postId,
|
176
211
|
Group: &fbcrawl.FacebookGroup{Id: groupId},
|
177
212
|
})
|
178
213
|
//f.detailCollector.Visit(url)
|
179
214
|
})
|
180
215
|
|
181
|
-
err = collector.Visit("https://mbasic.facebook.com/groups
|
216
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
|
182
217
|
if err != nil {
|
183
218
|
logger.Error("crawl by colly err:", err)
|
184
219
|
}
|
185
220
|
return err, &fbcrawl.FacebookPostList{Posts: result}
|
186
221
|
}
|
187
222
|
|
188
|
-
func (f *Fbcolly)
|
223
|
+
func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
|
224
|
+
collector := f.collector.Clone()
|
225
|
+
err := setupSharedCollector(collector)
|
226
|
+
currentPage := 1
|
227
|
+
var result []*fbcrawl.FacebookImage
|
228
|
+
|
229
|
+
collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
|
230
|
+
currentPage++
|
231
|
+
logger.Info("Will fetch page", currentPage)
|
232
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
233
|
+
})
|
234
|
+
|
235
|
+
collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
|
236
|
+
result = append(result, &fbcrawl.FacebookImage{
|
237
|
+
Id: getImageIdFromHref(element.Attr("href")),
|
238
|
+
})
|
239
|
+
//f.detailCollector.Visit(url)
|
240
|
+
})
|
241
|
+
|
242
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
|
243
|
+
if err != nil {
|
244
|
+
logger.Error("crawl by colly err:", err)
|
245
|
+
}
|
246
|
+
return err, &fbcrawl.FacebookImageList{Images: result}
|
247
|
+
}
|
248
|
+
|
249
|
+
func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
|
189
250
|
collector := f.collector.Clone()
|
190
251
|
err := setupSharedCollector(collector)
|
191
|
-
|
252
|
+
result := fbcrawl.FacebookImage{Id: imageId}
|
253
|
+
|
254
|
+
collector.OnHTML("a", func(element *colly.HTMLElement) {
|
255
|
+
result.Url = element.Attr("href")
|
256
|
+
})
|
257
|
+
|
258
|
+
err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/photo/view_full_size/?fbid=%d", imageId))
|
259
|
+
if err != nil {
|
260
|
+
logger.Error("crawl by colly err:", err)
|
261
|
+
}
|
262
|
+
return err, &result
|
263
|
+
}
|
264
|
+
|
265
|
+
func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
|
266
|
+
collector := f.collector.Clone()
|
267
|
+
err := setupSharedCollector(collector)
|
268
|
+
post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
|
269
|
+
commentPaging := 0
|
192
270
|
collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
|
193
271
|
dataElement := element.DOM.Find("div[data-ft]")
|
194
272
|
if dataElement.Length() > 0 {
|
195
|
-
var result
|
273
|
+
var result FbDataFt
|
196
274
|
jsonData, isExist := dataElement.Attr("data-ft")
|
197
275
|
if isExist {
|
198
|
-
|
276
|
+
logger.Info(jsonData)
|
277
|
+
err = json.Unmarshal([]byte(jsonData), &result)
|
278
|
+
if err != nil {
|
279
|
+
logger.Error(err)
|
280
|
+
return
|
281
|
+
}
|
199
282
|
logger.Info("Post ", result)
|
200
|
-
post.Id = result
|
201
|
-
post.Group = &fbcrawl.FacebookGroup{Id: result
|
202
|
-
post.User = &fbcrawl.FacebookUser{
|
283
|
+
post.Id = result.TopLevelPostId
|
284
|
+
post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
|
285
|
+
post.User = &fbcrawl.FacebookUser{
|
286
|
+
Id: result.ContentOwnerIdNew,
|
287
|
+
Name: dataElement.Find("h3 strong:first-child a").Text(),
|
288
|
+
}
|
289
|
+
post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
|
203
290
|
//Content
|
291
|
+
|
292
|
+
//NO BACKGROUND TEXT ONLY
|
204
293
|
post.Content = strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
205
294
|
return selection.Text()
|
206
295
|
}), "\n")
|
207
296
|
|
297
|
+
if len(post.Content) == 0 {
|
298
|
+
// TEXT WITH BACKGROUND
|
299
|
+
post.Content = dataElement.Find("div[style*=\"background-image:url\"]").Text()
|
300
|
+
}
|
301
|
+
|
302
|
+
post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
|
303
|
+
|
304
|
+
post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
|
305
|
+
i, _ := strconv.ParseInt(id, 10, 64)
|
306
|
+
return &fbcrawl.FacebookImage{
|
307
|
+
Id: i,
|
308
|
+
}
|
309
|
+
})).([]*fbcrawl.FacebookImage)
|
310
|
+
|
311
|
+
if result.PhotoId > 0 {
|
312
|
+
post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
|
313
|
+
}
|
314
|
+
|
208
315
|
logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
|
209
316
|
return selection.Text()
|
210
317
|
}), "\n"))
|
211
318
|
}
|
212
|
-
|
319
|
+
|
213
320
|
//Comment
|
214
321
|
element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
|
215
322
|
//author
|
216
|
-
commentId := selection.AttrOr("id", "")
|
323
|
+
commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
|
217
324
|
logger.Info("comment", commentId)
|
218
|
-
|
219
|
-
//idRegex.FindString()
|
325
|
+
createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
|
220
326
|
post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
|
221
327
|
Id: commentId,
|
222
328
|
Post: &fbcrawl.FacebookPost{Id: post.Id},
|
223
329
|
User: &fbcrawl.FacebookUser{
|
224
|
-
Id:
|
330
|
+
Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
|
225
331
|
Name: selection.Find("h3 > a").Text(),
|
226
332
|
},
|
227
|
-
Content:
|
333
|
+
Content: selection.Find("h3 + div").Text(),
|
334
|
+
CreatedAt: createdAtWhenResult.Time.Unix(),
|
228
335
|
})
|
229
336
|
})
|
337
|
+
|
338
|
+
}
|
339
|
+
})
|
340
|
+
|
341
|
+
collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
|
342
|
+
if commentPaging < 3 {
|
343
|
+
logger.Info("Comment paging", commentPaging)
|
344
|
+
err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
|
345
|
+
commentPaging = commentPaging + 1
|
230
346
|
}
|
231
347
|
})
|
232
|
-
|
348
|
+
|
349
|
+
err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
|
233
350
|
return err, post
|
234
351
|
}
|
352
|
+
|
353
|
+
func (f *Fbcolly) LoginWithCookies(cookies string) error {
|
354
|
+
collector := f.collector
|
355
|
+
return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
|
356
|
+
}
|
357
|
+
|
358
|
+
//func getUsernameFromHref(href string) string {
|
359
|
+
// return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
|
360
|
+
//}
|
361
|
+
|
362
|
+
func getUserIdFromCommentHref(href string) int64 {
|
363
|
+
id, _ := strconv.ParseInt(regexp.MustCompile("#comment_form_(\\d+)").FindStringSubmatch(href)[1], 10, 64)
|
364
|
+
return id
|
365
|
+
}
|
366
|
+
|
367
|
+
func getUrlFromRedirectHref(href string) string {
|
368
|
+
u, _ := url.Parse(href)
|
369
|
+
return u.Query().Get("u")
|
370
|
+
}
|
371
|
+
|
372
|
+
func getImageIdFromHref(href string) int64 {
|
373
|
+
u, _ := url.Parse(href)
|
374
|
+
i, _ := strconv.ParseInt(u.Query().Get("fbid"), 10, 64)
|
375
|
+
return i
|
376
|
+
}
|
data/fbcrawl-colly.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require_relative 'lib/
|
1
|
+
require_relative 'lib/fbcrawl_colly/version'
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "fbcrawl-colly"
|
@@ -32,4 +32,5 @@ Gem::Specification.new do |spec|
|
|
32
32
|
|
33
33
|
spec.add_runtime_dependency 'ffi'
|
34
34
|
spec.add_runtime_dependency 'google-protobuf'
|
35
|
+
spec.add_development_dependency 'rake-compiler'
|
35
36
|
end
|
data/fbcrawl.proto
CHANGED
@@ -1,33 +1,48 @@
|
|
1
1
|
syntax = "proto3";
|
2
2
|
|
3
|
+
package fbcrawl_colly;
|
3
4
|
option go_package = "./fbcrawl;fbcrawl";
|
4
5
|
|
5
6
|
// The request message containing the user's name.
|
6
7
|
message FacebookGroup {
|
7
|
-
|
8
|
+
int64 id = 1;
|
8
9
|
string name = 2;
|
9
10
|
}
|
10
11
|
|
11
12
|
message FacebookUser {
|
12
|
-
|
13
|
+
int64 id = 1;
|
13
14
|
string name = 2;
|
14
15
|
}
|
15
16
|
|
16
17
|
message FacebookPost {
|
17
|
-
|
18
|
+
int64 id = 1;
|
18
19
|
FacebookGroup group = 2;
|
19
20
|
FacebookUser user = 3;
|
20
21
|
string content = 4;
|
22
|
+
string content_link = 6;
|
23
|
+
FacebookImage content_image = 8;
|
24
|
+
repeated FacebookImage content_images = 7;
|
21
25
|
repeated FacebookComment comments = 5;
|
26
|
+
int64 created_at = 9;
|
27
|
+
}
|
28
|
+
|
29
|
+
message FacebookImage {
|
30
|
+
int64 id = 1;
|
31
|
+
string url = 2;
|
22
32
|
}
|
23
33
|
|
24
34
|
message FacebookComment {
|
25
|
-
|
35
|
+
int64 id = 1;
|
26
36
|
FacebookPost post = 2;
|
27
37
|
FacebookUser user = 3;
|
28
38
|
string content = 4;
|
39
|
+
int64 created_at = 5;
|
29
40
|
}
|
30
41
|
|
31
42
|
message FacebookPostList {
|
32
43
|
repeated FacebookPost posts = 1;
|
33
44
|
}
|
45
|
+
|
46
|
+
message FacebookImageList {
|
47
|
+
repeated FacebookImage images = 1;
|
48
|
+
}
|
data/go.mod
CHANGED
@@ -13,6 +13,7 @@ require (
|
|
13
13
|
github.com/golang/protobuf v1.4.2
|
14
14
|
github.com/google/logger v1.1.0
|
15
15
|
github.com/kennygrant/sanitize v1.2.4 // indirect
|
16
|
+
github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254
|
16
17
|
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
17
18
|
github.com/temoto/robotstxt v1.1.1 // indirect
|
18
19
|
github.com/thoas/go-funk v0.7.0
|
data/go.sum
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
|
2
|
+
github.com/AlekSi/pointer v1.0.0 h1:KWCWzsvFxNLcmM5XmiqHsGTTsuwZMsLFwWF9Y+//bNE=
|
3
|
+
github.com/AlekSi/pointer v1.0.0/go.mod h1:1kjywbfcPFCmncIxtk6fIEub6LKrfMz3gc5QKVOSOA8=
|
2
4
|
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
3
5
|
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
|
4
6
|
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
@@ -18,6 +20,7 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
|
|
18
20
|
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
|
19
21
|
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
|
20
22
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
23
|
+
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
21
24
|
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
22
25
|
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
|
23
26
|
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
|
@@ -50,6 +53,10 @@ github.com/google/logger v1.1.0 h1:saB74Etb4EAJNH3z74CVbCKk75hld/8T0CsXKetWCwM=
|
|
50
53
|
github.com/google/logger v1.1.0/go.mod h1:w7O8nrRr0xufejBlQMI83MXqRusvREoJdaAxV+CoAB4=
|
51
54
|
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
|
52
55
|
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
|
56
|
+
github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254 h1:JYoQR67E1vv1WGoeW8DkdFs7vrIEe/5wP+qJItd5tUE=
|
57
|
+
github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254/go.mod h1:DPucAeQGDPUzYUt+NaWw6qsF5SFapWWToxEiVDh2aV0=
|
58
|
+
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
|
59
|
+
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
53
60
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
54
61
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
55
62
|
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
|
data/lib/fbcrawl-colly.rb
CHANGED
@@ -1,16 +1,4 @@
|
|
1
|
-
require 'ffi'
|
2
|
-
require 'fbcrawl_pb'
|
3
|
-
module FbcrawlColly
|
4
|
-
extend FFI::Library
|
5
1
|
|
6
|
-
|
7
|
-
attach_function :free, [ :pointer ], :void
|
2
|
+
module FbcrawlColly
|
8
3
|
|
9
|
-
attach_function :Init, [], :pointer
|
10
|
-
attach_function :Login, [:pointer, :string, :string], :void
|
11
|
-
attach_function :FetchGroupFeed, [:pointer, :string], :string
|
12
|
-
attach_function :FetchPost, [:pointer, :string, :string], :string
|
13
|
-
# attach_function :FetchGroup, [:pointer, :string], :pointer
|
14
|
-
# attach_function :Login, [:pointer, :string, :string, :string], :void
|
15
|
-
# attach_function :FreePointer, [:pointer], :void
|
16
4
|
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
require_relative '../fbcrawl_pb'
|
3
|
+
require_relative './ffi'
|
4
|
+
|
5
|
+
module FbcrawlColly
|
6
|
+
class Colly
|
7
|
+
def initialize
|
8
|
+
super
|
9
|
+
@colly = ::FFI::AutoPointer.new(FbcrawlColly::FFI::Init(), FbcrawlColly::FFI.method(:FreeColly))
|
10
|
+
end
|
11
|
+
|
12
|
+
def login(email, password)
|
13
|
+
s, ptr = FbcrawlColly::FFI.Login(@colly, email, password)
|
14
|
+
FbcrawlColly::FFI.free(ptr)
|
15
|
+
s
|
16
|
+
end
|
17
|
+
|
18
|
+
def login_with_cookies(cookies)
|
19
|
+
FbcrawlColly::FFI.LoginWithCookies(@colly, cookies)
|
20
|
+
end
|
21
|
+
|
22
|
+
def fetch_group_feed(group_id)
|
23
|
+
s, ptr = FbcrawlColly::FFI.FetchGroupFeed(@colly, group_id)
|
24
|
+
list = FbcrawlColly::FacebookPostList.decode(s)
|
25
|
+
FbcrawlColly::FFI.free(ptr)
|
26
|
+
list
|
27
|
+
end
|
28
|
+
|
29
|
+
def fetch_post(group_id, post_id)
|
30
|
+
s, ptr = FbcrawlColly::FFI.FetchPost(@colly, group_id, post_id)
|
31
|
+
post = FbcrawlColly::FacebookPost.decode(s)
|
32
|
+
FbcrawlColly::FFI.free(ptr)
|
33
|
+
post
|
34
|
+
end
|
35
|
+
|
36
|
+
def fetch_content_images(post_id)
|
37
|
+
s, ptr = FbcrawlColly::FFI.FetchContentImages(@colly, post_id)
|
38
|
+
imageList = FbcrawlColly::FacebookImageList.decode(s)
|
39
|
+
FbcrawlColly::FFI.free(ptr)
|
40
|
+
imageList
|
41
|
+
end
|
42
|
+
|
43
|
+
def fetch_image_url(image_id)
|
44
|
+
s, ptr = FbcrawlColly::FFI.FetchImageUrl(@colly, image_id)
|
45
|
+
image = FbcrawlColly::FacebookImage.decode(s)
|
46
|
+
FbcrawlColly::FFI.free(ptr)
|
47
|
+
image
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
module FbcrawlColly::FFI
|
3
|
+
extend FFI::Library
|
4
|
+
|
5
|
+
ffi_lib File.expand_path("../../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
|
6
|
+
attach_function :free, [ :pointer ], :void
|
7
|
+
|
8
|
+
attach_function :Init, [], :pointer
|
9
|
+
attach_function :FreeColly, [:pointer], :pointer
|
10
|
+
attach_function :Login, [:pointer, :string, :string], :strptr
|
11
|
+
attach_function :LoginWithCookies, [:pointer, :string], :void
|
12
|
+
attach_function :FetchGroupFeed, [:pointer, :int64], :strptr
|
13
|
+
attach_function :FetchPost, [:pointer, :int64, :int64], :strptr
|
14
|
+
attach_function :FetchContentImages, [:pointer, :int64], :strptr
|
15
|
+
attach_function :FetchImageUrl, [:pointer, :int64], :strptr
|
16
|
+
# attach_function :FetchGroup, [:pointer, :string], :pointer
|
17
|
+
end
|
data/main.go
CHANGED
@@ -27,36 +27,69 @@ var password = flag.String("password", "change_me", "facebook password")
|
|
27
27
|
var otp = flag.String("otp", "123456", "facebook otp")
|
28
28
|
var groupId = flag.String("groupId", "334294967318328", "facebook group id, default is 334294967318328")
|
29
29
|
|
30
|
-
var
|
30
|
+
var allInstances = map[uintptr]*fbcolly.Fbcolly{}
|
31
31
|
|
32
32
|
//export Init
|
33
33
|
func Init() uintptr {
|
34
|
-
|
34
|
+
instance := fbcolly.New()
|
35
|
+
ptr := (uintptr)(unsafe.Pointer(instance))
|
36
|
+
allInstances[ptr] = instance
|
37
|
+
return ptr
|
38
|
+
}
|
39
|
+
|
40
|
+
//export FreeColly
|
41
|
+
func FreeColly(pointer unsafe.Pointer) {
|
42
|
+
delete(allInstances, uintptr(pointer))
|
35
43
|
}
|
36
44
|
|
37
45
|
//export Login
|
38
|
-
func Login(pointer unsafe.Pointer, email *C.char, password *C.char) {
|
46
|
+
func Login(pointer unsafe.Pointer, email *C.char, password *C.char) *C.char {
|
47
|
+
p := (*fbcolly.Fbcolly)(pointer)
|
48
|
+
cookies, err := p.Login(C.GoString(email), C.GoString(password), "")
|
49
|
+
if err == nil {
|
50
|
+
return C.CString(cookies)
|
51
|
+
}
|
52
|
+
return nil
|
53
|
+
}
|
54
|
+
|
55
|
+
//export LoginWithCookies
|
56
|
+
func LoginWithCookies(pointer unsafe.Pointer, cookies *C.char) {
|
39
57
|
p := (*fbcolly.Fbcolly)(pointer)
|
40
|
-
|
41
|
-
p.Login(C.GoString(email), C.GoString(password), "")
|
58
|
+
p.LoginWithCookies(C.GoString(cookies))
|
42
59
|
}
|
43
60
|
|
44
61
|
//export FetchGroupFeed
|
45
|
-
func FetchGroupFeed(pointer unsafe.Pointer, groupId
|
62
|
+
func FetchGroupFeed(pointer unsafe.Pointer, groupId int64) unsafe.Pointer {
|
46
63
|
p := (*fbcolly.Fbcolly)(pointer)
|
47
|
-
_, postsList := p.FetchGroupFeed(
|
64
|
+
_, postsList := p.FetchGroupFeed(groupId)
|
48
65
|
marshaledPostsList, _ := proto.Marshal(postsList)
|
49
66
|
return C.CBytes(append(marshaledPostsList, 0))
|
50
67
|
}
|
51
68
|
|
52
69
|
//export FetchPost
|
53
|
-
func FetchPost(pointer unsafe.Pointer, groupId
|
70
|
+
func FetchPost(pointer unsafe.Pointer, groupId int64, postId int64) unsafe.Pointer {
|
54
71
|
p := (*fbcolly.Fbcolly)(pointer)
|
55
|
-
_, post := p.FetchPost(
|
72
|
+
_, post := p.FetchPost(groupId, postId)
|
56
73
|
marshaledPost, _ := proto.Marshal(post)
|
57
74
|
return C.CBytes(append(marshaledPost, 0))
|
58
75
|
}
|
59
76
|
|
77
|
+
//export FetchContentImages
|
78
|
+
func FetchContentImages(pointer unsafe.Pointer, postId int64) unsafe.Pointer {
|
79
|
+
p := (*fbcolly.Fbcolly)(pointer)
|
80
|
+
_, imageList := p.FetchContentImages(postId)
|
81
|
+
marshaled, _ := proto.Marshal(imageList)
|
82
|
+
return C.CBytes(append(marshaled, 0))
|
83
|
+
}
|
84
|
+
|
85
|
+
//export FetchImageUrl
|
86
|
+
func FetchImageUrl(pointer unsafe.Pointer, imageId int64) unsafe.Pointer {
|
87
|
+
p := (*fbcolly.Fbcolly)(pointer)
|
88
|
+
_, image := p.FetchImageUrl(imageId)
|
89
|
+
marshaled, _ := proto.Marshal(image)
|
90
|
+
return C.CBytes(append(marshaled, 0))
|
91
|
+
}
|
92
|
+
|
60
93
|
func main() {
|
61
94
|
//r := regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch()[1]
|
62
95
|
//print(r.FindStringSubmatch("/liem.phamthanh.161?refid=18&__tn__=R")[1])
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fbcrawl-colly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Duy Le
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake-compiler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: Crawl mbasic.facebook.com using GO Colly
|
42
56
|
email:
|
43
57
|
- duyleekun@gmail.com
|
@@ -47,9 +61,9 @@ extensions:
|
|
47
61
|
extra_rdoc_files: []
|
48
62
|
files:
|
49
63
|
- ".gitignore"
|
50
|
-
- ".travis.yml"
|
51
64
|
- CODE_OF_CONDUCT.md
|
52
65
|
- Gemfile
|
66
|
+
- Gemfile.lock
|
53
67
|
- LICENSE.txt
|
54
68
|
- README.md
|
55
69
|
- Rakefile
|
@@ -64,7 +78,9 @@ files:
|
|
64
78
|
- go.mod
|
65
79
|
- go.sum
|
66
80
|
- lib/fbcrawl-colly.rb
|
67
|
-
- lib/
|
81
|
+
- lib/fbcrawl_colly/colly.rb
|
82
|
+
- lib/fbcrawl_colly/ffi.rb
|
83
|
+
- lib/fbcrawl_colly/version.rb
|
68
84
|
- main.go
|
69
85
|
homepage: http://github.com/duyleekun/fbcrawl-colly
|
70
86
|
licenses:
|