fbcrawl-colly 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96d233862f21d1dff4a447d57f0f4a4ee6311506d7b5cf1990d75bf92043defa
4
- data.tar.gz: a238ee4a34eac5f33fa90bba0d5bdb22169bd0b54034682f8fbb25c1ca4de039
3
+ metadata.gz: 22ec88aaf52a44344fc32ab22f6914f264b49ea10912c527c030ffd97ac12d36
4
+ data.tar.gz: d007e4326b15c64e725009548c8e9dac00e263da888d736876b6093a4fba5108
5
5
  SHA512:
6
- metadata.gz: 36d58d4ef38bf94164f94fe0a841171cc5bdd12a4f2503e818858d77cae029e1fc79cddd3213b2bb9e130e70833703df97ad18700d92c2cc3e31401b8e1f1443
7
- data.tar.gz: 5c56a623910f87369812c0d74908e618490b4aeead9d22b055f8076ca040ba9fe5fff0bb380c44e7e435e0d83a13823b259b0f6c57233d7fb3ec38ff83de0832
6
+ metadata.gz: 448bbb6b045d3b4baa3c0eae23bad85f0daf93159f514ff77e0f9383350b1c52aa3ab5360646099e56ccc19eea0bc8300ba42e67a26262e1e1f3141be61515c0
7
+ data.tar.gz: 5d326d26fb0354ea78236849be7328ce49f919435c7c373130f19439c2447f9026ce31f88bc64d5cc72581fadc618a0891cd777f885e438f01217295b58309e5
data/.gitignore CHANGED
@@ -14,3 +14,6 @@ last.html
14
14
  *.db
15
15
  /fbcrawl/fbcrawl.pb.go
16
16
  /lib/fbcrawl_pb.rb
17
+
18
+ mkmf.log
19
+ .rakeTasks
@@ -0,0 +1,28 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ fbcrawl-colly (0.1.1)
5
+ ffi
6
+ google-protobuf
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ffi (1.13.1)
12
+ google-protobuf (3.12.4-universal-darwin)
13
+ minitest (5.14.1)
14
+ rake (12.3.3)
15
+ rake-compiler (1.1.1)
16
+ rake
17
+
18
+ PLATFORMS
19
+ ruby
20
+
21
+ DEPENDENCIES
22
+ fbcrawl-colly!
23
+ minitest (~> 5.0)
24
+ rake (~> 12.0)
25
+ rake-compiler
26
+
27
+ BUNDLED WITH
28
+ 2.1.4
data/Rakefile CHANGED
@@ -7,4 +7,13 @@ Rake::TestTask.new(:test) do |t|
7
7
  t.test_files = FileList["test/**/*_test.rb"]
8
8
  end
9
9
 
10
# Builds the native library: loads extconf.rb from the extension directory
# and then runs make there.
task :fbcrawl_colly do
  Dir.chdir("./ext/fbcrawl_colly/") do
    # Resolved against the extension directory because of the chdir above.
    require './extconf'
    # sh (unlike backticks) streams make's output and raises when make exits
    # with a non-zero status, so a broken build stops dependent tasks instead
    # of being silently ignored.
    sh "make"
  end
end
16
+
17
+ task :compile => [:fbcrawl_colly]
18
+ task :test => :compile
10
19
  task :default => :test
@@ -1,5 +1,6 @@
1
1
  require 'mkmf'
2
- MakeMakefile::find_executable 'go'
3
- MakeMakefile::find_executable 'protoc'
4
- MakeMakefile::find_executable 'protoc-gen-go'
5
- $makefile_created = true
2
+ requirement_passed = true
3
+ requirement_passed &&= MakeMakefile::find_executable 'go'
4
+ requirement_passed &&= MakeMakefile::find_executable 'protoc'
5
+ requirement_passed &&= MakeMakefile::find_executable 'protoc-gen-go'
6
+ $makefile_created = requirement_passed
@@ -10,14 +10,35 @@ import (
10
10
  "github.com/gocolly/colly/extensions"
11
11
  "github.com/gocolly/colly/storage"
12
12
  "github.com/google/logger"
13
+ "github.com/olebedev/when"
14
+ "github.com/olebedev/when/rules/common"
15
+ "github.com/olebedev/when/rules/en"
16
+ "github.com/thoas/go-funk"
13
17
  "net/url"
14
18
  "qnetwork.net/fbcrawl/fbcrawl"
15
19
  "regexp"
20
+ "strconv"
16
21
  "strings"
22
+ "time"
17
23
  )
18
24
 
19
25
  type Fbcolly struct {
20
26
  collector *colly.Collector
27
+ w *when.Parser
28
+ }
29
+ type FbDataPostContext struct {
30
+ PublishTime int64 `json:"publish_time"`
31
+ }
32
+ type FbDataInsight struct {
33
+ FbDataPostContext `json:"post_context"`
34
+ }
35
+ type FbDataFt struct {
36
+ ContentOwnerIdNew int64 `json:"content_owner_id_new"`
37
+ PhotoAttachmentsList []string `json:"photo_attachments_list"`
38
+ PhotoId int64 `json:"photo_id,string"`
39
+ PageId int64 `json:"page_id,string"`
40
+ TopLevelPostId int64 `json:"top_level_post_id,string"`
41
+ PageInsights map[string]FbDataInsight `json:"page_insights"`
21
42
  }
22
43
 
23
44
  func sharedOnRequest(request *colly.Request) {
@@ -79,16 +100,19 @@ func getForm(element *colly.HTMLElement, err error) (string, error, map[string]s
79
100
  func New() *Fbcolly {
80
101
  f := Fbcolly{}
81
102
  f.collector = colly.NewCollector()
103
+ f.w = when.New(nil)
104
+ f.w.Add(en.All...)
105
+ f.w.Add(common.All...)
82
106
  return &f
83
107
  }
84
108
 
85
- func (f *Fbcolly) Login(email string, password string, otp string) error {
109
+ func (f *Fbcolly) Login(email string, password string, otp string) (string, error) {
86
110
  collector := f.collector.Clone()
87
- setupSharedCollector(collector)
111
+ err := setupSharedCollector(collector)
88
112
 
89
113
  logger.Info("Login using email", email)
114
+ loggedIn := false
90
115
 
91
- var err error
92
116
  collector.OnHTML("#login_form", func(element *colly.HTMLElement) {
93
117
  logger.Info("OnHTML login_form")
94
118
  loginURL, err, reqMap := getForm(element, err)
@@ -106,12 +130,13 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
106
130
  })
107
131
 
108
132
  collector.OnHTML("a[href=\"/login/save-device/cancel/?flow=interstitial_nux&nux_source=regular_login\"]", func(element *colly.HTMLElement) {
109
- collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
133
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
110
134
  })
111
135
 
112
136
  collector.OnHTML("form[action=\"/login/checkpoint/\"]", func(element *colly.HTMLElement) {
113
137
 
114
138
  checkpointUrl, err, reqMap := getForm(element, err)
139
+ shouldSubmit := false
115
140
  if err != nil {
116
141
  logger.Error(err)
117
142
  return
@@ -121,6 +146,7 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
121
146
  //Save Device
122
147
  logger.Info("OnHTML Save Device checkpoint")
123
148
  reqMap["name_action_selected"] = "dont_save"
149
+ shouldSubmit = true
124
150
  } else if element.DOM.Find("input[name=\"approvals_code\"]").Length() > 0 {
125
151
  logger.Info("OnHTML OTP checkpoint")
126
152
  //logger.Info("Please input OTP")
@@ -128,11 +154,15 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
128
154
  //code, _ := reader.ReadString('\n')
129
155
  code := otp[0:6]
130
156
  reqMap["approvals_code"] = code
157
+ shouldSubmit = true
131
158
  } else {
132
159
  logger.Info("OnHTML Only Continue checkpoint")
160
+
161
+ }
162
+ if shouldSubmit {
163
+ logger.Info("req map:", reqMap)
164
+ err = collector.Post(checkpointUrl, reqMap)
133
165
  }
134
- logger.Info("req map:", reqMap)
135
- err = collector.Post(checkpointUrl, reqMap)
136
166
  if err != nil {
137
167
  logger.Error("post err:", err)
138
168
  }
@@ -141,19 +171,24 @@ func (f *Fbcolly) Login(email string, password string, otp string) error {
141
171
  collector.OnHTML("form[action=\"/search/\"]", func(element *colly.HTMLElement) {
142
172
  //We're in home
143
173
  logger.Info("I'm IN HOME, navigate to page now")
174
+ loggedIn = true
144
175
  })
145
176
 
146
177
  err = collector.Visit("https://mbasic.facebook.com/")
147
178
  if err != nil {
148
179
  logger.Error("crawl by colly err:", err)
149
180
  }
150
- logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
151
- //return err, storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/"))
152
- //return err, collector.getS.Cookies("https://mbasic.facebook.com/")
153
- return err
181
+
182
+ if loggedIn {
183
+ logger.Info(storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")))
184
+ return storage.StringifyCookies(collector.Cookies("https://mbasic.facebook.com/")), err
185
+ } else {
186
+ return "", err
187
+ }
188
+
154
189
  }
155
190
 
156
- func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostList) {
191
+ func (f *Fbcolly) FetchGroupFeed(groupId int64) (error, *fbcrawl.FacebookPostList) {
157
192
  collector := f.collector.Clone()
158
193
  err := setupSharedCollector(collector)
159
194
  currentPage := 1
@@ -163,72 +198,179 @@ func (f *Fbcolly) FetchGroupFeed(groupId string) (error, *fbcrawl.FacebookPostLi
163
198
  currentPage++
164
199
  if currentPage < 3 {
165
200
  logger.Info("Will fetch page", currentPage)
166
- collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
201
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
167
202
  }
168
203
  })
169
204
 
170
- //TODO: May not need this
171
205
  collector.OnXML("//a[text()=\"Full Story\"]", func(element *colly.XMLElement) {
206
+ logger.Info("Post found at", element.Attr("href"))
172
207
  u, _ := url.Parse("http://mbasic.facebook.com" + element.Attr("href"))
173
-
208
+ postId, _ := strconv.ParseInt(u.Query().Get("id"), 10, 64)
174
209
  result = append(result, &fbcrawl.FacebookPost{
175
- Id: u.Query().Get("id"),
210
+ Id: postId,
176
211
  Group: &fbcrawl.FacebookGroup{Id: groupId},
177
212
  })
178
213
  //f.detailCollector.Visit(url)
179
214
  })
180
215
 
181
- err = collector.Visit("https://mbasic.facebook.com/groups/" + groupId)
216
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/groups/%d", groupId))
182
217
  if err != nil {
183
218
  logger.Error("crawl by colly err:", err)
184
219
  }
185
220
  return err, &fbcrawl.FacebookPostList{Posts: result}
186
221
  }
187
222
 
188
- func (f *Fbcolly) FetchPost(groupId string, postId string) (error, *fbcrawl.FacebookPost) {
223
+ func (f *Fbcolly) FetchContentImages(postId int64) (error, *fbcrawl.FacebookImageList) {
224
+ collector := f.collector.Clone()
225
+ err := setupSharedCollector(collector)
226
+ currentPage := 1
227
+ var result []*fbcrawl.FacebookImage
228
+
229
+ collector.OnHTML("a[href*=\"/media/set/\"]", func(element *colly.HTMLElement) {
230
+ currentPage++
231
+ logger.Info("Will fetch page", currentPage)
232
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
233
+ })
234
+
235
+ collector.OnHTML("a[href*=\"/photo.php\"]", func(element *colly.HTMLElement) {
236
+ result = append(result, &fbcrawl.FacebookImage{
237
+ Id: getImageIdFromHref(element.Attr("href")),
238
+ })
239
+ //f.detailCollector.Visit(url)
240
+ })
241
+
242
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/media/set/?set=pcb.%d", postId))
243
+ if err != nil {
244
+ logger.Error("crawl by colly err:", err)
245
+ }
246
+ return err, &fbcrawl.FacebookImageList{Images: result}
247
+ }
248
+
249
+ func (f *Fbcolly) FetchImageUrl(imageId int64) (error, *fbcrawl.FacebookImage) {
189
250
  collector := f.collector.Clone()
190
251
  err := setupSharedCollector(collector)
191
- post := &fbcrawl.FacebookPost{}
252
+ result := fbcrawl.FacebookImage{Id: imageId}
253
+
254
+ collector.OnHTML("a", func(element *colly.HTMLElement) {
255
+ result.Url = element.Attr("href")
256
+ })
257
+
258
+ err = collector.Visit(fmt.Sprintf("https://mbasic.facebook.com/photo/view_full_size/?fbid=%d", imageId))
259
+ if err != nil {
260
+ logger.Error("crawl by colly err:", err)
261
+ }
262
+ return err, &result
263
+ }
264
+
265
+ func (f *Fbcolly) FetchPost(groupId int64, postId int64) (error, *fbcrawl.FacebookPost) {
266
+ collector := f.collector.Clone()
267
+ err := setupSharedCollector(collector)
268
+ post := &fbcrawl.FacebookPost{Comments: []*fbcrawl.FacebookComment{}}
269
+ commentPaging := 0
192
270
  collector.OnHTML("#m_story_permalink_view", func(element *colly.HTMLElement) {
193
271
  dataElement := element.DOM.Find("div[data-ft]")
194
272
  if dataElement.Length() > 0 {
195
- var result map[string]string
273
+ var result FbDataFt
196
274
  jsonData, isExist := dataElement.Attr("data-ft")
197
275
  if isExist {
198
- json.Unmarshal([]byte(jsonData), &result)
276
+ logger.Info(jsonData)
277
+ err = json.Unmarshal([]byte(jsonData), &result)
278
+ if err != nil {
279
+ logger.Error(err)
280
+ return
281
+ }
199
282
  logger.Info("Post ", result)
200
- post.Id = result["top_level_post_id"]
201
- post.Group = &fbcrawl.FacebookGroup{Id: result["page_id"], Name: dataElement.Find("h3 strong:last-child a").Text()}
202
- post.User = &fbcrawl.FacebookUser{Id: result["content_owner_id_new"], Name: dataElement.Find("h3 strong:first-child a").Text()}
283
+ post.Id = result.TopLevelPostId
284
+ post.Group = &fbcrawl.FacebookGroup{Id: result.PageId, Name: dataElement.Find("h3 strong:last-child a").Text()}
285
+ post.User = &fbcrawl.FacebookUser{
286
+ Id: result.ContentOwnerIdNew,
287
+ Name: dataElement.Find("h3 strong:first-child a").Text(),
288
+ }
289
+ post.CreatedAt = result.PageInsights[strconv.FormatInt(result.PageId, 10)].PublishTime
203
290
  //Content
291
+
292
+ //NO BACKGROUND TEXT ONLY
204
293
  post.Content = strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
205
294
  return selection.Text()
206
295
  }), "\n")
207
296
 
297
+ if len(post.Content) == 0 {
298
+ // TEXT WITH BACKGROUND
299
+ post.Content = dataElement.Find("div[style*=\"background-image:url\"]").Text()
300
+ }
301
+
302
+ post.ContentLink = getUrlFromRedirectHref(dataElement.Find("a[href*=\"https://lm.facebook.com/l.php\"]").AttrOr("href", ""))
303
+
304
+ post.ContentImages = (funk.Map(result.PhotoAttachmentsList, func(id string) *fbcrawl.FacebookImage {
305
+ i, _ := strconv.ParseInt(id, 10, 64)
306
+ return &fbcrawl.FacebookImage{
307
+ Id: i,
308
+ }
309
+ })).([]*fbcrawl.FacebookImage)
310
+
311
+ if result.PhotoId > 0 {
312
+ post.ContentImage = &fbcrawl.FacebookImage{Id: result.PhotoId}
313
+ }
314
+
208
315
  logger.Info("content", strings.Join(dataElement.Find("p").Map(func(i int, selection *goquery.Selection) string {
209
316
  return selection.Text()
210
317
  }), "\n"))
211
318
  }
212
- post.Comments = []*fbcrawl.FacebookComment{}
319
+
213
320
  //Comment
214
321
  element.DOM.Find("h3 + div + div + div").Parent().Parent().Each(func(i int, selection *goquery.Selection) {
215
322
  //author
216
- commentId := selection.AttrOr("id", "")
323
+ commentId, _ := strconv.ParseInt(selection.AttrOr("id", ""), 10, 64)
217
324
  logger.Info("comment", commentId)
218
- //idRegex, _ := regexp.Compile("")
219
- //idRegex.FindString()
325
+ createdAtWhenResult, _ := f.w.Parse(selection.Find("abbr").Text(), time.Now())
220
326
  post.Comments = append(post.Comments, &fbcrawl.FacebookComment{
221
327
  Id: commentId,
222
328
  Post: &fbcrawl.FacebookPost{Id: post.Id},
223
329
  User: &fbcrawl.FacebookUser{
224
- Id: regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch(selection.Find("h3 > a").AttrOr("href", ""))[1],
330
+ Id: getUserIdFromCommentHref(selection.Find("a[href*=\"#comment_form_\"]").AttrOr("href", "")),
225
331
  Name: selection.Find("h3 > a").Text(),
226
332
  },
227
- Content: selection.Find("h3 + div").Text(),
333
+ Content: selection.Find("h3 + div").Text(),
334
+ CreatedAt: createdAtWhenResult.Time.Unix(),
228
335
  })
229
336
  })
337
+
338
+ }
339
+ })
340
+
341
+ collector.OnHTML("div[id*=\"see_prev_\"] > a", func(element *colly.HTMLElement) {
342
+ if commentPaging < 3 {
343
+ logger.Info("Comment paging", commentPaging)
344
+ err = collector.Visit("http://mbasic.facebook.com" + element.Attr("href"))
345
+ commentPaging = commentPaging + 1
230
346
  }
231
347
  })
232
- collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%s?view=permalink&id=%s&_rdr", groupId, postId))
348
+
349
+ err = collector.Visit(fmt.Sprintf("http://mbasic.facebook.com/groups/%d?view=permalink&id=%d&_rdr", groupId, postId))
233
350
  return err, post
234
351
  }
352
+
353
+ func (f *Fbcolly) LoginWithCookies(cookies string) error {
354
+ collector := f.collector
355
+ return collector.SetCookies("https://mbasic.facebook.com/", storage.UnstringifyCookies(cookies))
356
+ }
357
+
358
+ //func getUsernameFromHref(href string) string {
359
+ // return regexp.MustCompile("/([\\d\\w.]+).*").FindStringSubmatch(href)[1]
360
+ //}
361
+
362
// commentFormIDPattern captures the run of digits that directly follows
// "#comment_form_". Compiled once instead of on every call.
var commentFormIDPattern = regexp.MustCompile(`#comment_form_(\d+)`)

// getUserIdFromCommentHref returns the number that follows "#comment_form_"
// in href.
//
// It returns 0 when href contains no "#comment_form_<digits>" fragment or
// when the digits do not fit in an int64, so callers never see a panic for a
// link in an unexpected format.
func getUserIdFromCommentHref(href string) int64 {
	match := commentFormIDPattern.FindStringSubmatch(href)
	if match == nil {
		// No fragment of the expected form: there is no capture group to read.
		return 0
	}
	id, err := strconv.ParseInt(match[1], 10, 64)
	if err != nil {
		return 0
	}
	return id
}
366
+
367
// getUrlFromRedirectHref returns the decoded value of the "u" query
// parameter of href.
//
// It returns "" when href cannot be parsed as a URL or has no "u" parameter.
func getUrlFromRedirectHref(href string) string {
	parsed, err := url.Parse(href)
	if err != nil {
		// url.Parse returns a nil *URL on failure; calling Query on it would
		// panic, so bail out with the "not found" value instead.
		return ""
	}
	return parsed.Query().Get("u")
}
371
+
372
// getImageIdFromHref returns the "fbid" query parameter of href as an int64.
//
// It returns 0 when href cannot be parsed as a URL, when "fbid" is absent,
// or when its value is not a base-10 integer that fits in an int64.
func getImageIdFromHref(href string) int64 {
	parsed, err := url.Parse(href)
	if err != nil {
		// url.Parse returns a nil *URL on failure; calling Query on it would
		// panic.
		return 0
	}
	id, err := strconv.ParseInt(parsed.Query().Get("fbid"), 10, 64)
	if err != nil {
		return 0
	}
	return id
}
@@ -1,4 +1,4 @@
1
- require_relative 'lib/fbcrawl-colly/version'
1
+ require_relative 'lib/fbcrawl_colly/version'
2
2
 
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "fbcrawl-colly"
@@ -32,4 +32,5 @@ Gem::Specification.new do |spec|
32
32
 
33
33
  spec.add_runtime_dependency 'ffi'
34
34
  spec.add_runtime_dependency 'google-protobuf'
35
+ spec.add_development_dependency 'rake-compiler'
35
36
  end
@@ -1,33 +1,48 @@
1
1
  syntax = "proto3";
2
2
 
3
+ package fbcrawl_colly;
3
4
  option go_package = "./fbcrawl;fbcrawl";
4
5
 
5
6
  // A Facebook group: its numeric id and its name.
6
7
  message FacebookGroup {
7
- string id = 1;
8
+ int64 id = 1;
8
9
  string name = 2;
9
10
  }
10
11
 
11
12
  message FacebookUser {
12
- string id = 1;
13
+ int64 id = 1;
13
14
  string name = 2;
14
15
  }
15
16
 
16
17
  message FacebookPost {
17
- string id = 1;
18
+ int64 id = 1;
18
19
  FacebookGroup group = 2;
19
20
  FacebookUser user = 3;
20
21
  string content = 4;
22
+ string content_link = 6;
23
+ FacebookImage content_image = 8;
24
+ repeated FacebookImage content_images = 7;
21
25
  repeated FacebookComment comments = 5;
26
+ int64 created_at = 9;
27
+ }
28
+
29
+ message FacebookImage {
30
+ int64 id = 1;
31
+ string url = 2;
22
32
  }
23
33
 
24
34
  message FacebookComment {
25
- string id = 1;
35
+ int64 id = 1;
26
36
  FacebookPost post = 2;
27
37
  FacebookUser user = 3;
28
38
  string content = 4;
39
+ int64 created_at = 5;
29
40
  }
30
41
 
31
42
  message FacebookPostList {
32
43
  repeated FacebookPost posts = 1;
33
44
  }
45
+
46
+ message FacebookImageList {
47
+ repeated FacebookImage images = 1;
48
+ }
data/go.mod CHANGED
@@ -13,6 +13,7 @@ require (
13
13
  github.com/golang/protobuf v1.4.2
14
14
  github.com/google/logger v1.1.0
15
15
  github.com/kennygrant/sanitize v1.2.4 // indirect
16
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254
16
17
  github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
17
18
  github.com/temoto/robotstxt v1.1.1 // indirect
18
19
  github.com/thoas/go-funk v0.7.0
data/go.sum CHANGED
@@ -1,4 +1,6 @@
1
1
  cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
2
+ github.com/AlekSi/pointer v1.0.0 h1:KWCWzsvFxNLcmM5XmiqHsGTTsuwZMsLFwWF9Y+//bNE=
3
+ github.com/AlekSi/pointer v1.0.0/go.mod h1:1kjywbfcPFCmncIxtk6fIEub6LKrfMz3gc5QKVOSOA8=
2
4
  github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
3
5
  github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
4
6
  github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
@@ -18,6 +20,7 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
18
20
  github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
19
21
  github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
20
22
  github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
23
+ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
21
24
  github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
22
25
  github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
23
26
  github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
@@ -50,6 +53,10 @@ github.com/google/logger v1.1.0 h1:saB74Etb4EAJNH3z74CVbCKk75hld/8T0CsXKetWCwM=
50
53
  github.com/google/logger v1.1.0/go.mod h1:w7O8nrRr0xufejBlQMI83MXqRusvREoJdaAxV+CoAB4=
51
54
  github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
52
55
  github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
56
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254 h1:JYoQR67E1vv1WGoeW8DkdFs7vrIEe/5wP+qJItd5tUE=
57
+ github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254/go.mod h1:DPucAeQGDPUzYUt+NaWw6qsF5SFapWWToxEiVDh2aV0=
58
+ github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
59
+ github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
53
60
  github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
54
61
  github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
55
62
  github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -1,16 +1,4 @@
1
- require 'ffi'
2
- require 'fbcrawl_pb'
3
- module FbcrawlColly
4
- extend FFI::Library
5
1
 
6
- ffi_lib File.expand_path("../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
7
- attach_function :free, [ :pointer ], :void
2
+ module FbcrawlColly
8
3
 
9
- attach_function :Init, [], :pointer
10
- attach_function :Login, [:pointer, :string, :string], :void
11
- attach_function :FetchGroupFeed, [:pointer, :string], :string
12
- attach_function :FetchPost, [:pointer, :string, :string], :string
13
- # attach_function :FetchGroup, [:pointer, :string], :pointer
14
- # attach_function :Login, [:pointer, :string, :string, :string], :void
15
- # attach_function :FreePointer, [:pointer], :void
16
4
  end
@@ -0,0 +1,50 @@
1
+ require 'ffi'
2
+ require_relative '../fbcrawl_pb'
3
+ require_relative './ffi'
4
+
5
module FbcrawlColly
  # Wraps one crawler instance created by the native library's Init function
  # and exposes its calls as Ruby methods that return decoded protobuf
  # messages.
  class Colly
    def initialize
      super
      # The AutoPointer is given FreeColly as its releaser for the handle
      # returned by Init.
      @colly = ::FFI::AutoPointer.new(FbcrawlColly::FFI::Init(), FbcrawlColly::FFI.method(:FreeColly))
    end

    # Calls the native Login with the given credentials and returns the string
    # half of its reply (presumably the session cookies - confirm against the
    # Go side). The native buffer is freed before returning.
    def login(email, password)
      s, ptr = FbcrawlColly::FFI.Login(@colly, email, password)
      FbcrawlColly::FFI.free(ptr)
      s
    end

    # Passes a cookie string to the native LoginWithCookies call.
    def login_with_cookies(cookies)
      FbcrawlColly::FFI.LoginWithCookies(@colly, cookies)
    end

    # Returns a FacebookPostList decoded from the native FetchGroupFeed reply.
    def fetch_group_feed(group_id)
      decode_reply(FbcrawlColly::FacebookPostList, FbcrawlColly::FFI.FetchGroupFeed(@colly, group_id))
    end

    # Returns a FacebookPost decoded from the native FetchPost reply.
    def fetch_post(group_id, post_id)
      decode_reply(FbcrawlColly::FacebookPost, FbcrawlColly::FFI.FetchPost(@colly, group_id, post_id))
    end

    # Returns a FacebookImageList decoded from the native FetchContentImages
    # reply.
    def fetch_content_images(post_id)
      decode_reply(FbcrawlColly::FacebookImageList, FbcrawlColly::FFI.FetchContentImages(@colly, post_id))
    end

    # Returns a FacebookImage decoded from the native FetchImageUrl reply.
    def fetch_image_url(image_id)
      decode_reply(FbcrawlColly::FacebookImage, FbcrawlColly::FFI.FetchImageUrl(@colly, image_id))
    end

    private

    # Decodes a [string, pointer] reply with message_class and frees the
    # native buffer. The free sits in ensure so the buffer is released even
    # when decode raises on a malformed payload.
    def decode_reply(message_class, reply)
      payload, ptr = reply
      message_class.decode(payload)
    ensure
      FbcrawlColly::FFI.free(ptr) if ptr
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'ffi'
2
# FFI bindings for the functions exported by the fbcolly shared library.
module FbcrawlColly::FFI
  extend FFI::Library

  ffi_lib File.expand_path("../../ext/fbcrawl_colly/fbcolly.so", File.dirname(__FILE__))
  # Used to release the buffers returned by the :strptr functions below.
  attach_function :free, [ :pointer ], :void

  attach_function :Init, [], :pointer
  # The exported FreeColly function has no return value, so it is bound as
  # :void rather than :pointer.
  attach_function :FreeColly, [:pointer], :void
  # NOTE(review): the :strptr bindings below hand the reply to Ruby as a C
  # string. The Fetch* replies are marshaled protobuf bytes, which may contain
  # 0x00 bytes; presumably such a reply would be cut short - confirm.
  attach_function :Login, [:pointer, :string, :string], :strptr
  attach_function :LoginWithCookies, [:pointer, :string], :void
  attach_function :FetchGroupFeed, [:pointer, :int64], :strptr
  attach_function :FetchPost, [:pointer, :int64, :int64], :strptr
  attach_function :FetchContentImages, [:pointer, :int64], :strptr
  attach_function :FetchImageUrl, [:pointer, :int64], :strptr
  # attach_function :FetchGroup, [:pointer, :string], :pointer
end
@@ -1,3 +1,3 @@
1
1
  module FbcrawlColly
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/main.go CHANGED
@@ -27,36 +27,69 @@ var password = flag.String("password", "change_me", "facebook password")
27
27
  var otp = flag.String("otp", "123456", "facebook otp")
28
28
  var groupId = flag.String("groupId", "334294967318328", "facebook group id, default is 334294967318328")
29
29
 
30
- var tmp = fbcolly.New()
30
+ var allInstances = map[uintptr]*fbcolly.Fbcolly{}
31
31
 
32
32
  //export Init
33
33
  func Init() uintptr {
34
- return (uintptr)(unsafe.Pointer(tmp))
34
+ instance := fbcolly.New()
35
+ ptr := (uintptr)(unsafe.Pointer(instance))
36
+ allInstances[ptr] = instance
37
+ return ptr
38
+ }
39
+
40
+ //export FreeColly
41
+ func FreeColly(pointer unsafe.Pointer) {
42
+ delete(allInstances, uintptr(pointer))
35
43
  }
36
44
 
37
45
  //export Login
38
- func Login(pointer unsafe.Pointer, email *C.char, password *C.char) {
46
+ func Login(pointer unsafe.Pointer, email *C.char, password *C.char) *C.char {
47
+ p := (*fbcolly.Fbcolly)(pointer)
48
+ cookies, err := p.Login(C.GoString(email), C.GoString(password), "")
49
+ if err == nil {
50
+ return C.CString(cookies)
51
+ }
52
+ return nil
53
+ }
54
+
55
+ //export LoginWithCookies
56
+ func LoginWithCookies(pointer unsafe.Pointer, cookies *C.char) {
39
57
  p := (*fbcolly.Fbcolly)(pointer)
40
- //print(p.E)
41
- p.Login(C.GoString(email), C.GoString(password), "")
58
+ p.LoginWithCookies(C.GoString(cookies))
42
59
  }
43
60
 
44
61
  //export FetchGroupFeed
45
- func FetchGroupFeed(pointer unsafe.Pointer, groupId *C.char) unsafe.Pointer {
62
+ func FetchGroupFeed(pointer unsafe.Pointer, groupId int64) unsafe.Pointer {
46
63
  p := (*fbcolly.Fbcolly)(pointer)
47
- _, postsList := p.FetchGroupFeed(C.GoString(groupId))
64
+ _, postsList := p.FetchGroupFeed(groupId)
48
65
  marshaledPostsList, _ := proto.Marshal(postsList)
49
66
  return C.CBytes(append(marshaledPostsList, 0))
50
67
  }
51
68
 
52
69
  //export FetchPost
53
- func FetchPost(pointer unsafe.Pointer, groupId *C.char, postId *C.char) unsafe.Pointer {
70
+ func FetchPost(pointer unsafe.Pointer, groupId int64, postId int64) unsafe.Pointer {
54
71
  p := (*fbcolly.Fbcolly)(pointer)
55
- _, post := p.FetchPost(C.GoString(groupId), C.GoString(postId))
72
+ _, post := p.FetchPost(groupId, postId)
56
73
  marshaledPost, _ := proto.Marshal(post)
57
74
  return C.CBytes(append(marshaledPost, 0))
58
75
  }
59
76
 
77
+ //export FetchContentImages
78
+ func FetchContentImages(pointer unsafe.Pointer, postId int64) unsafe.Pointer {
79
+ p := (*fbcolly.Fbcolly)(pointer)
80
+ _, imageList := p.FetchContentImages(postId)
81
+ marshaled, _ := proto.Marshal(imageList)
82
+ return C.CBytes(append(marshaled, 0))
83
+ }
84
+
85
+ //export FetchImageUrl
86
+ func FetchImageUrl(pointer unsafe.Pointer, imageId int64) unsafe.Pointer {
87
+ p := (*fbcolly.Fbcolly)(pointer)
88
+ _, image := p.FetchImageUrl(imageId)
89
+ marshaled, _ := proto.Marshal(image)
90
+ return C.CBytes(append(marshaled, 0))
91
+ }
92
+
60
93
  func main() {
61
94
  //r := regexp.MustCompile("/([\\d\\w.]+)").FindStringSubmatch()[1]
62
95
  //print(r.FindStringSubmatch("/liem.phamthanh.161?refid=18&__tn__=R")[1])
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fbcrawl-colly
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duy Le
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-05 00:00:00.000000000 Z
11
+ date: 2020-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: Crawl mbasic.facebook.com using GO Colly
42
56
  email:
43
57
  - duyleekun@gmail.com
@@ -47,9 +61,9 @@ extensions:
47
61
  extra_rdoc_files: []
48
62
  files:
49
63
  - ".gitignore"
50
- - ".travis.yml"
51
64
  - CODE_OF_CONDUCT.md
52
65
  - Gemfile
66
+ - Gemfile.lock
53
67
  - LICENSE.txt
54
68
  - README.md
55
69
  - Rakefile
@@ -64,7 +78,9 @@ files:
64
78
  - go.mod
65
79
  - go.sum
66
80
  - lib/fbcrawl-colly.rb
67
- - lib/fbcrawl-colly/version.rb
81
+ - lib/fbcrawl_colly/colly.rb
82
+ - lib/fbcrawl_colly/ffi.rb
83
+ - lib/fbcrawl_colly/version.rb
68
84
  - main.go
69
85
  homepage: http://github.com/duyleekun/fbcrawl-colly
70
86
  licenses:
@@ -1,6 +0,0 @@
1
- ---
2
- language: ruby
3
- cache: bundler
4
- rvm:
5
- - 2.7.1
6
- before_install: gem install bundler -v 2.1.4